From 678702573531f19ae36847a6a07257aaae623fbe Mon Sep 17 00:00:00 2001 From: Sadaf Ebrahimi Date: Fri, 25 Aug 2023 16:27:50 +0000 Subject: Move libyuv/files/ directly under libyuv Test: TreeHugger Merged-In: I773d1ae01539cc5d200768b526f10b2922567f72 Change-Id: I4ba1f1e781d7fd3ad96639dfdc08f654e45ae3d3 --- source/compare.cc | 430 ++ source/compare_common.cc | 74 + source/compare_gcc.cc | 359 ++ source/compare_msa.cc | 97 + source/compare_neon.cc | 96 + source/compare_neon64.cc | 94 + source/compare_win.cc | 241 ++ source/convert.cc | 4000 ++++++++++++++++++ source/convert_argb.cc | 8277 ++++++++++++++++++++++++++++++++++++ source/convert_from.cc | 882 ++++ source/convert_from_argb.cc | 3284 +++++++++++++++ source/convert_jpeg.cc | 602 +++ source/convert_to_argb.cc | 382 ++ source/convert_to_i420.cc | 280 ++ source/cpu_id.cc | 376 ++ source/mjpeg_decoder.cc | 581 +++ source/mjpeg_validate.cc | 71 + source/planar_functions.cc | 5715 +++++++++++++++++++++++++ source/rotate.cc | 1196 ++++++ source/rotate_any.cc | 79 + source/rotate_argb.cc | 257 ++ source/rotate_common.cc | 229 + source/rotate_gcc.cc | 503 +++ source/rotate_lsx.cc | 243 ++ source/rotate_msa.cc | 250 ++ source/rotate_neon.cc | 458 ++ source/rotate_neon64.cc | 482 +++ source/rotate_win.cc | 253 ++ source/row_any.cc | 2459 +++++++++++ source/row_common.cc | 4547 ++++++++++++++++++++ source/row_gcc.cc | 9731 +++++++++++++++++++++++++++++++++++++++++++ source/row_lasx.cc | 2304 ++++++++++ source/row_lsx.cc | 2987 +++++++++++++ source/row_msa.cc | 3597 ++++++++++++++++ source/row_neon.cc | 3957 ++++++++++++++++++ source/row_neon64.cc | 4509 ++++++++++++++++++++ source/row_rvv.cc | 956 +++++ source/row_win.cc | 6440 ++++++++++++++++++++++++++++ source/scale.cc | 2592 ++++++++++++ source/scale_any.cc | 1078 +++++ source/scale_argb.cc | 1157 +++++ source/scale_common.cc | 1999 +++++++++ source/scale_gcc.cc | 2953 +++++++++++++ source/scale_lsx.cc | 739 ++++ source/scale_msa.cc | 949 +++++ source/scale_neon.cc | 1533 +++++++ source/scale_neon64.cc | 1673 ++++++++ source/scale_rgb.cc | 66 + source/scale_uv.cc | 1171 ++++++ source/scale_win.cc | 1392 +++++++ source/test.sh | 35 + source/video_common.cc | 62 + 52 files changed, 88677 insertions(+) create mode 100644 source/compare.cc create mode 100644 source/compare_common.cc create mode 100644 source/compare_gcc.cc create mode 100644 source/compare_msa.cc create mode 100644 source/compare_neon.cc create mode 100644 source/compare_neon64.cc create mode 100644 source/compare_win.cc create mode 100644 source/convert.cc create mode 100644 source/convert_argb.cc create mode 100644 source/convert_from.cc create mode 100644 source/convert_from_argb.cc create mode 100644 source/convert_jpeg.cc create mode 100644 source/convert_to_argb.cc create mode 100644 source/convert_to_i420.cc create mode 100644 source/cpu_id.cc create mode 100644 source/mjpeg_decoder.cc create mode 100644 source/mjpeg_validate.cc create mode 100644 source/planar_functions.cc create mode 100644 source/rotate.cc create mode 100644 source/rotate_any.cc create mode 100644 source/rotate_argb.cc create mode 100644 source/rotate_common.cc create mode 100644 source/rotate_gcc.cc create mode 100644 source/rotate_lsx.cc create mode 100644 source/rotate_msa.cc create mode 100644 source/rotate_neon.cc create mode 100644 source/rotate_neon64.cc create mode 100644 source/rotate_win.cc create mode 100644 source/row_any.cc create mode 100644 source/row_common.cc create mode 100644 source/row_gcc.cc create mode 100644 source/row_lasx.cc 
create mode 100644 source/row_lsx.cc create mode 100644 source/row_msa.cc create mode 100644 source/row_neon.cc create mode 100644 source/row_neon64.cc create mode 100644 source/row_rvv.cc create mode 100644 source/row_win.cc create mode 100644 source/scale.cc create mode 100644 source/scale_any.cc create mode 100644 source/scale_argb.cc create mode 100644 source/scale_common.cc create mode 100644 source/scale_gcc.cc create mode 100644 source/scale_lsx.cc create mode 100644 source/scale_msa.cc create mode 100644 source/scale_neon.cc create mode 100644 source/scale_neon64.cc create mode 100644 source/scale_rgb.cc create mode 100644 source/scale_uv.cc create mode 100644 source/scale_win.cc create mode 100755 source/test.sh create mode 100644 source/video_common.cc diff --git a/source/compare.cc b/source/compare.cc new file mode 100644 index 00000000..50a736bd --- /dev/null +++ b/source/compare.cc @@ -0,0 +1,430 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/compare.h" + +#include <float.h> +#include <math.h> +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "libyuv/basic_types.h" +#include "libyuv/compare_row.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// hash seed of 5381 recommended. +LIBYUV_API +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = + HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64_t)kBlockSize) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)count & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)count & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. + return FOURCC_BGRA; + } + if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255. + return FOURCC_ARGB; + } + argb += 8; + } + if (width & 1) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + } + return 0; +} + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
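+// A minimal usage sketch (illustrative; `pixels`, `width` and `height` are +// hypothetical caller names, with the stride assuming a packed buffer): +// uint32_t fourcc = ARGBDetect(pixels, width * 4, width, height); +// if (fourcc == 0) { /* every sampled alpha byte was 255; layout unknown */ }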
+LIBYUV_API +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { + uint32_t fourcc = 0; + int h; + + // Coalesce rows. + if (stride_argb == width * 4) { + width *= height; + height = 1; + stride_argb = 0; + } + for (h = 0; h < height && fourcc == 0; ++h) { + fourcc = ARGBDetectRow_C(argb, width); + argb += stride_argb; + } + return fourcc; +} + +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 65536 - 32 bytes. + +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); + uint64_t diff = 0; + int i; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif + +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & (kSimdSize - 1); + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32_t. + // After each block of 65536 pixels, accumulate into a uint64_t. + const int kBlockSize = 65536; + int remainder = count & (kBlockSize - 1) & ~31; + uint64_t sse = 0; + int i; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#endif +#if defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + // Note only used for multiples of 16 so count is not checked. + SumSquareError = SumSquareError_SSE2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // Note only used for multiples of 32 so count is not checked.
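+ // (Tails are handled after the block loop below: a multiple-of-32 + // remainder goes through the selected kernel and the final count & 31 + // bytes go through SumSquareError_C.)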
+ SumSquareError = SumSquareError_AVX2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sse) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + sse += SumSquareError(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + sse += SumSquareError(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 31; + if (remainder) { + sse += SumSquareError_C(src_a, src_b, remainder); + } + return sse; +} + +LIBYUV_API +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + uint64_t sse = 0; + int h; + // Coalesce rows. + if (stride_a == width && stride_b == width) { + width *= height; + height = 1; + stride_a = stride_b = 0; + } + for (h = 0; h < height; ++h) { + sse += ComputeSumSquareError(src_a, src_b, width); + src_a += stride_a; + src_b += stride_b; + } + return sse; +} + +LIBYUV_API +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { + double psnr; + if (sse > 0) { + double mse = (double)count / (double)sse; + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) { + psnr = kMaxPsnr; + } + + return psnr; +} + +LIBYUV_API +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + const uint64_t samples = (uint64_t)width * (uint64_t)height; + const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); + return SumSquareErrorToPsnr(sse, samples); +} + +LIBYUV_API +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const uint64_t sse_u = ComputeSumSquareErrorPlane( + src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); + const uint64_t sse_v = ComputeSumSquareErrorPlane( + src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); + const uint64_t samples = (uint64_t)width * (uint64_t)height + + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + const uint64_t sse = sse_y + sse_u + sse_v; + return SumSquareErrorToPsnr(sse, samples); +} + +static const int64_t cc1 = 26634; // 64^2 * (.01*255)^2 +static const int64_t cc2 = 239708; // 64^2 * (.03*255)^2 + +static double Ssim8x8_C(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b) { + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_sq_a = 0; + int64_t sum_sq_b = 0; + int64_t sum_axb = 0; + + int i; + for (i = 0; i < 8; ++i) { + int j; + for (j = 0; j < 8; ++j) { + sum_a += src_a[j]; + sum_b += src_b[j]; + sum_sq_a += src_a[j] * src_a[j]; + sum_sq_b += src_b[j] * src_b[j]; + sum_axb += src_a[j] * src_b[j]; + } + + src_a += stride_a; + src_b += stride_b; + } + + { + const int64_t count = 64; + // scale the constants by number of pixels + const int64_t c1 = (cc1 * count * count) >> 12; + const int64_t
c2 = (cc2 * count * count) >> 12; + + const int64_t sum_a_x_sum_b = sum_a * sum_b; + + const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + + const int64_t sum_a_sq = sum_a * sum_a; + const int64_t sum_b_sq = sum_b * sum_b; + + const int64_t ssim_d = + (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); + + if (ssim_d == 0) { + return DBL_MAX; + } + return (double)ssim_n / (double)ssim_d; + } +} + +// We use an 8x8 moving window, with each window's starting location on a +// 4x4 pixel grid. This arrangement allows the windows to overlap block +// boundaries to penalize blocking artifacts. +LIBYUV_API +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + int samples = 0; + double ssim_total = 0; + double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, + int stride_b) = Ssim8x8_C; + + // sample points start at each 4x4 location + int i; + for (i = 0; i < height - 8; i += 4) { + int j; + for (j = 0; j < width - 8; j += 4) { + ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); + samples++; + } + + src_a += stride_a * 4; + src_b += stride_b * 4; + } + + ssim_total /= samples; + return ssim_total; +} + +LIBYUV_API +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const double ssim_y = + CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, + width_uv, height_uv); + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, + width_uv, height_uv); + return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/compare_common.cc b/source/compare_common.cc new file mode 100644 index 00000000..d1cab8d2 --- /dev/null +++ b/source/compare_common.cc @@ -0,0 +1,74 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Hakmem method for hamming distance.
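+// (The loop below is the classic SWAR popcount on each XORed 32-bit word: +// x - ((x >> 1) & 0x55555555) leaves a 2-bit count per bit pair, the +// 0x33333333 step sums pairs into nibbles, the 0x0f0f0f0f step sums nibbles +// into bytes, and the multiply by 0x01010101 gathers all four byte counts +// into the top byte, which the final >> 24 extracts.)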
+uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b); + uint32_t u = x - ((x >> 1) & 0x55555555); + u = ((u >> 2) & 0x33333333) + (u & 0x33333333); + diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); + src_a += 4; + src_b += 4; + } + + for (; i < count; ++i) { + uint32_t x = *src_a ^ *src_b; + uint32_t u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + + return diff; +} + +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + for (i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; + sse += (uint32_t)(diff * diff); + } + return sse; +} + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency. +uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc new file mode 100644 index 00000000..33cbe25d --- /dev/null +++ b/source/compare_gcc.cc @@ -0,0 +1,359 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(__x86_64__) +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint64_t diff = 0u; + + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" + + // Process 32 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return (uint32_t)(diff); +} +#else +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + // Process 16 bytes per loop. 
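+ // (popcnt requires SSE 4.2 support; ComputeHammingDistance selects this + // kernel only when TestCpuFlag(kCpuHasSSE42) reports it.)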
+ LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + + return diff; +} +#endif // HAS_HAMMINGDISTANCE_AVX2 + +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + return sse; +} + +static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash; + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/compare_msa.cc b/source/compare_msa.cc new file mode 100644 index 00000000..0b807d37 --- /dev/null +++ b/source/compare_msa.cc @@ -0,0 +1,97 @@ +/* + * Copyright 2017 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + int i; + v16u8 src0, src1, src2, src3; + v2i64 vec0 = {0}, vec1 = {0}; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + src0 ^= src2; + src1 ^= src3; + vec0 += __msa_pcnt_d((v2i64)src0); + vec1 += __msa_pcnt_d((v2i64)src1); + src_a += 32; + src_b += 32; + } + + vec0 += vec1; + diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); + diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); + return diff; +} + +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; + v2i64 tmp0; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); + reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); + reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); + reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); + src_a += 32; + src_b += 32; + } + + reg0 += reg1; + reg2 += reg3; + reg0 += reg2; + tmp0 = __msa_hadd_s_d(reg0, reg0); + sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); + sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); + return sse; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/source/compare_neon.cc b/source/compare_neon.cc new file mode 100644 index 00000000..afdd6012 --- /dev/null +++ b/source/compare_neon.cc @@ -0,0 +1,96 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator + + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" + + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc new file mode 100644 index 00000000..70fb9b91 --- /dev/null +++ b/source/compare_neon64.cc @@ -0,0 +1,94 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + asm volatile( + "movi v4.8h, #0 \n" + + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" + + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/compare_win.cc b/source/compare_win.cc new file mode 100644 index 00000000..9bb27f1d --- /dev/null +++ b/source/compare_win.cc @@ -0,0 +1,241 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#if defined(_MSC_VER) +#include <intrin.h> // For __popcnt +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) + +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + src_a += 4; + src_b += 4; + diff += __popcnt(x); + } + return diff; +} + +__declspec(naked) uint32_t + SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + wloop: + movdqu xmm1, [eax] + lea eax, [eax + 16] + movdqu xmm2, [edx] + lea edx, [edx + 16] + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + sub ecx, 16 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +#ifdef HAS_SUMSQUAREERROR_AVX2 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable : 4752) +__declspec(naked) uint32_t + SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + sub ecx, 32 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // HAS_SUMSQUAREERROR_AVX2 + +uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +__declspec(naked) uint32_t + HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, xmmword ptr kHash16x33 + + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + movdqa xmm5, xmmword ptr kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld xmm3, xmm5 + movdqa xmm5, xmmword ptr kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld xmm4, xmm5 + movdqa xmm5, xmmword ptr kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld xmm2, xmm5 + movdqa xmm5, xmmword ptr kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + sub ecx, 16 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. +#ifdef HAS_HASHDJB2_AVX2 +__declspec(naked) uint32_t + HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + vmovd xmm0, [esp + 12] // seed + + wloop: + vpmovzxbd xmm3, [eax] // src[0-3] + vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 + vpmovzxbd xmm4, [eax + 4] // src[4-7] + vpmulld xmm3, xmm3, xmmword ptr kHashMul0 + vpmovzxbd xmm2, [eax + 8] // src[8-11] + vpmulld xmm4, xmm4, xmmword ptr kHashMul1 + vpmovzxbd xmm1, [eax + 12] // src[12-15] + vpmulld xmm2, xmm2, xmmword ptr kHashMul2 + lea eax, [eax + 16] + vpmulld xmm1, xmm1, xmmword ptr kHashMul3 + vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm1, xmm1, xmm3 + vpshufd xmm2, xmm1, 0x0e // upper 2 dwords + vpaddd xmm1, xmm1,xmm2 + vpshufd xmm2, xmm1, 0x01 + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm0, xmm0, xmm1 + sub ecx, 16 + jg wloop + + vmovd eax, xmm0 // return hash + vzeroupper + ret + } +} +#endif // HAS_HASHDJB2_AVX2 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/convert.cc b/source/convert.cc new file mode 100644 index 00000000..b11ab1bf --- /dev/null +++ b/source/convert.cc @@ -0,0 +1,4000 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/scale_row.h" // For FixedDiv +#include "libyuv/scale_uv.h" // For UVScale() + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Subsample amount uses a shift. +// v is value +// a is amount to add to round up +// s is shift to subsample down +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int src_uv_width, + int src_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (src_uv_width <= 0 || src_uv_height == 0) { + return -1; + } + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Copy I420 with optional flipping. +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// it does row coalescing. +LIBYUV_API +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Copy I010 with optional flipping.
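+// (As in I420Copy above, dst_y may be NULL to skip the luma copy; the two +// chroma planes are always copied.)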
+LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +static int Planar16bitTo8bit(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + int scale = 1 << (24 - depth); + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + uv_height = -uv_height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (uv_height - 1) * src_stride_u; + src_v = src_v + (uv_height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + // Convert UV planes. + Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width, + uv_height); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width, + uv_height); + return 0; +} + +static int I41xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
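+ // (The idiom below points each source pointer at its last row and negates + // the stride, so the top-to-bottom conversion reads the source bottom-up, + // flipping the image vertically.)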
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, scale, kFilterBilinear); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, scale, kFilterBilinear); + } + return 0; +} + +static int I21xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + const int dy = FixedDiv(height, uv_height); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + } + return 0; +} + +// Convert 10 bit YUV to 8 bit. 
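+// (Planar16bitTo8bit passes scale = 1 << (24 - depth); assuming the row +// kernels compute approximately (v * scale) >> 16, depth 10 reduces to +// v >> 2, the expected 10-bit to 8-bit shift.)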
+LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 10); +} + +LIBYUV_API +int I210ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); +} + +LIBYUV_API +int I210ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 0, 10); +} + +LIBYUV_API +int I410ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); +} + +LIBYUV_API +int I410ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 0, + 0, 10); +} + +LIBYUV_API +int I012ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 12); +} + +LIBYUV_API +int I212ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 0, 12); +} + +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const 
uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + +LIBYUV_API +int I412ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 0, + 0, 12); +} + +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + +// Any Ix10 To I010 format with mirroring. +static int Ix10ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y) { + const int dst_y_width = Abs(width); + const int dst_y_height = Abs(height); + const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (width <= 0 || height == 0) { + return -1; + } + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +LIBYUV_API +int I410ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 0, 0); +} + +LIBYUV_API +int I210ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 1, 0); +} + +// Any I[420]1[02] to P[420]1[02] format with 
mirroring. +static int IxxxToPxxx(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if (width <= 0 || height == 0) { + return -1; + } + + ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + depth); + MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv, + dst_stride_uv, uv_width, uv_height, depth); + return 0; +} + +LIBYUV_API +int I010ToP010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 1, 10); +} + +LIBYUV_API +int I210ToP210(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 0, 10); +} + +LIBYUV_API +int I012ToP012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 1, 12); +} + +LIBYUV_API +int I212ToP212(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 0, 12); +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, src_uv_width, height); +} + +LIBYUV_API +int I422ToI210(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // 
Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + height); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + height); + return 0; +} + +// TODO(fbarchard): Implement row conversion. +LIBYUV_API +int I422ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Allocate u and v buffers + align_buffer_64(plane_u, halfwidth * halfheight * 2); + uint8_t* plane_v = plane_u + halfwidth * halfheight; + + I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, + height); + MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, + halfwidth, halfheight); + free_aligned_buffer_64(plane_u); + return 0; +} + +LIBYUV_API +int MM21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0) { + return -1; + } + + int sign = height < 0 ? -1 : 1; + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1, + (height + sign) / 2, 16); + + return 0; +} + +LIBYUV_API +int MM21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int sign = height < 0 ? -1 : 1; + + if (!src_uv || !dst_u || !dst_v || width <= 0) { + return -1; + } + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16); + + return 0; +} + +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + if (!src_y || !src_uv || !dst_yuy2 || width <= 0) { + return -1; + } + + DetileToYUY2(src_y, src_stride_y, src_uv, src_stride_uv, dst_yuy2, + dst_stride_yuy2, width, height, 32); + + return 0; +} + +// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format +// documentation. +// TODO(greenjustin): Add an MT2T to I420 conversion. 
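+// MT2T packs 10-bit samples densely in 16x32 Y tiles and 16x16 UV tiles, so
+// a row of tiles spans padded_width * tile_height * 10 / 8 bytes.
+// Illustrative arithmetic (sketch, values assumed) for a 1280-wide frame:
+//   padded_width     = (1280 + 15) & ~15;   // 1280, already a tile multiple
+//   y_tile_row_size  = 1280 * 32 * 10 / 8;  // 51200 bytes
+//   uv_tile_row_size = 1280 * 16 * 10 / 8;  // 25600 bytes
+// matching the row-buffer sizing inside MT2TToP010 below.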
+LIBYUV_API +int MT2TToP010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || !height || !src_uv || !dst_uv) { + return -1; + } + + { + int uv_width = (width + 1) & ~1; + int uv_height = (height + 1) / 2; + int y = 0; + const int tile_width = 16; + const int y_tile_height = 32; + const int uv_tile_height = 16; + int padded_width = (width + tile_width - 1) & ~(tile_width - 1); + int y_tile_row_size = padded_width * y_tile_height * 10 / 8; + int uv_tile_row_size = padded_width * uv_tile_height * 10 / 8; + size_t row_buf_size = padded_width * y_tile_height * sizeof(uint16_t); + void (*UnpackMT2T)(const uint8_t* src, uint16_t* dst, size_t size) = + UnpackMT2T_C; + align_buffer_64(row_buf, row_buf_size); + +#if defined(HAS_UNPACKMT2T_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UnpackMT2T = UnpackMT2T_NEON; + } +#endif + // Negative height means invert the image. + if (height < 0) { + height = -height; + uv_height = (height + 1) / 2; + if (dst_y) { + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + dst_uv = dst_uv + (uv_height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + + // Unpack and detile Y in rows of tiles + if (src_y && dst_y) { + for (y = 0; y < (height & ~(y_tile_height - 1)); y += y_tile_height) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, y_tile_height, y_tile_height); + src_y += src_stride_y * y_tile_height; + dst_y += dst_stride_y * y_tile_height; + } + if (height & (y_tile_height - 1)) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, height & (y_tile_height - 1), y_tile_height); + } + } + + // Unpack and detile UV plane + for (y = 0; y < (uv_height & ~(uv_tile_height - 1)); y += uv_tile_height) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_tile_height, uv_tile_height); + src_uv += src_stride_uv * uv_tile_height; + dst_uv += dst_stride_uv * uv_tile_height; + } + if (uv_height & (uv_tile_height - 1)) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_height & (uv_tile_height - 1), + uv_tile_height); + } + free_aligned_buffer_64(row_buf); + } + return 0; +} + +#ifdef I422TONV21_ROW_VERSION +// Unittest fails for this version. +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +// Swap src_u and src_v to implement I422ToNV12 +LIBYUV_API +int I422ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    // 422 source chroma is full height, so invert with (height - 1).
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    MergeUVRow = MergeUVRow_Any_LSX;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    InterpolateRow = InterpolateRow_Any_LSX;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
+
+  if (dst_y) {
+    // The Y plane is full width; only chroma is subsampled.
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  {
+    // Allocate 2 rows of vu.
+ int awidth = halfwidth * 2; + align_buffer_64(row_vu_0, awidth * 2); + uint8_t* row_vu_1 = row_vu_0 + awidth; + + for (y = 0; y < height - 1; y += 2) { + MergeUVRow(src_v, src_u, row_vu_0, halfwidth); + MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1, + halfwidth); + InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128); + src_u += src_stride_u * 2; + src_v += src_stride_v * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + MergeUVRow(src_v, src_u, dst_vu, halfwidth); + } + free_aligned_buffer_64(row_vu_0); + } + return 0; +} +#endif // I422TONV21_ROW_VERSION + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, width, height); +} + +LIBYUV_API +int I444ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, + dst_stride_uv, width, height); + return 0; +} + +LIBYUV_API +int I444ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, + width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToNV21(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128); + return 0; +} + +// Convert NV12 to I420. +// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm. +LIBYUV_API +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + // Coalesce rows. + if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // Split UV plane - NV12 / NV21 + SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight); + + return 0; +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. 
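+// Illustrative call (sketch; buffer names and sizes are hypothetical):
+//   int hw = (w + 1) / 2;            // chroma width after 2:1 subsampling
+//   NV21ToI420(y, w, vu, hw * 2, dy, w, du, hw, dv, hw, w, h);
+// dst_u and dst_v are simply exchanged before delegating to NV12ToI420,
+// since NV21 differs from NV12 only in the order of the interleaved bytes.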
+LIBYUV_API +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, + width, height); +} + +LIBYUV_API +int NV12ToNV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +LIBYUV_API +int NV16ToNV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + +// Any P[420]1[02] to I[420]1[02] format with mirroring. +static int PxxxToIxxx(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if (width <= 0 || height == 0) { + return -1; + } + ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + depth); + SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, uv_width, uv_height, depth); + return 0; +} + +LIBYUV_API +int P010ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 10); +} + +LIBYUV_API +int P012ToI012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 12); +} + +LIBYUV_API +int P010ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + 
Abs(width), Abs(height), kFilterBilinear); + } + UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +LIBYUV_API +int P210ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUVRow = YUY2ToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUVRow = YUY2ToUVRow_LSX; + } + } +#endif +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; + YUY2ToUVRow = YUY2ToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_LASX; + YUY2ToUVRow = YUY2ToUVRow_LASX; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. 
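+// UYVY packs 2 pixels into 4 bytes ordered U0 Y0 V0 Y1, so a row of `width`
+// pixels spans width * 2 bytes and chroma is already 2:1 horizontally; only
+// the vertical 2:1 averaging remains to reach 420.
+// Sketch of the first two pixels (byte offsets; sample values hypothetical):
+//   src_uyvy[0] = U01, src_uyvy[1] = Y0, src_uyvy[2] = V01, src_uyvy[3] = Y1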
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int y;
+  void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUVRow = UYVYToUVRow_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUVRow = UYVYToUVRow_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    UYVYToUVRow = UYVYToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUVRow = UYVYToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUVRow = UYVYToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUVRow = UYVYToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    UYVYToYRow = UYVYToYRow_Any_LSX;
+    UYVYToUVRow = UYVYToUVRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_LSX;
+      UYVYToUVRow = UYVYToUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    UYVYToYRow = UYVYToYRow_Any_LASX;
+    UYVYToUVRow = UYVYToUVRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_LASX;
+      UYVYToUVRow = UYVYToUVRow_LASX;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+    src_uyvy += src_stride_uyvy * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int y;
+  void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+                      uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+      AYUVToYRow_C;
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToUVRow = AYUVToUVRow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToUVRow = AYUVToUVRow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToUVRow = AYUVToUVRow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToUVRow = AYUVToUVRow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToUVRow = AYUVToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToUVRow = AYUVToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + AYUVToUVRow(src_ayuv, 0, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + +// Convert AYUV to NV21. +LIBYUV_API +int AYUVToNV21(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv, + uint8_t* dst_vu, int width) = AYUVToVURow_C; + void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = + AYUVToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToVURow = AYUVToVURow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToVURow = AYUVToVURow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToVURow = AYUVToVURow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToVURow = AYUVToVURow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToVURow = AYUVToVURow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToVURow = AYUVToVURow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + AYUVToVURow(src_ayuv, 0, dst_vu, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. 
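+// ARGBToI420 follows the row-dispatch pattern used throughout this file:
+// begin with the portable _C row functions, then upgrade to the best SIMD
+// kernel TestCpuFlag() reports, keeping the _Any_ variant unless the width
+// meets that kernel's alignment. Sketch of the idiom (names as used below):
+//   if (TestCpuFlag(kCpuHasNEON)) {
+//     ARGBToYRow = ARGBToYRow_Any_NEON;  // any width; handles the tail
+//     if (IS_ALIGNED(width, 16)) {
+//       ARGBToYRow = ARGBToYRow_NEON;    // full-vector fast path
+//     }
+//   }
+// The main loop then consumes two ARGB rows per iteration so each UV output
+// row is averaged from a vertically adjacent pair of source rows.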
+LIBYUV_API +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +#ifdef USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +// The following version calls ARGBExtractAlpha on the full image. 
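+// It reads src_argb twice (once for YUV, once for alpha) but reuses the
+// existing kernels unchanged; the variant in the #else branch below fuses
+// alpha extraction into the row loop, touching each source row only once.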
+LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height); + if (r == 0) { + r = ARGBExtractAlpha(src_argb, src_stride_argb, dst_a, dst_stride_a, width, + height); + } + return r; +} +#else // USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? 
ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX + : ARGBExtractAlphaRow_Any_LSX; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); + ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a, + width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + dst_a += dst_stride_a * 2; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); + } + return 0; +} +#endif // USE_EXTRACTALPHA + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = + BGRAToYRow_C; + if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToYRow = BGRAToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_AVX2; + } + } +#endif +#if defined(HAS_BGRATOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToUVRow = BGRAToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToUVRow = BGRAToUVRow_AVX2; + } + } +#endif +#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + BGRAToUVRow = BGRAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + BGRAToUVRow = BGRAToUVRow_MSA; + } + } +#endif +#if defined(HAS_BGRATOYROW_LSX) && defined(HAS_BGRATOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + BGRAToYRow = BGRAToYRow_Any_LSX; + BGRAToUVRow = BGRAToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_LSX; + BGRAToUVRow = BGRAToUVRow_LSX; + } + } +#endif +#if defined(HAS_BGRATOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + BGRAToYRow = BGRAToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_LASX; + } + } +#endif +#if defined(HAS_BGRATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BGRAToYRow = BGRAToYRow_RVV; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = + ABGRToYRow_C; + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LSX) && defined(HAS_ABGRTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + ABGRToUVRow = ABGRToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_LSX; + ABGRToUVRow = ABGRToUVRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. +LIBYUV_API +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = + RGBAToYRow_C; + if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+    src_stride_rgba = -src_stride_rgba;
+  }
+#if defined(HAS_RGBATOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGBAToYRow = RGBAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToYRow = RGBAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToYRow = RGBAToYRow_Any_MSA;
+    RGBAToUVRow = RGBAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_MSA;
+      RGBAToUVRow = RGBAToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_LSX) && defined(HAS_RGBATOUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RGBAToYRow = RGBAToYRow_Any_LSX;
+    RGBAToUVRow = RGBAToUVRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_LSX;
+      RGBAToUVRow = RGBAToUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RGBAToYRow = RGBAToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RGBAToYRow = RGBAToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RGBAToYRow = RGBAToYRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+    RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+    src_rgba += src_stride_rgba * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+  }
+  return 0;
+}
+
+// Enabled if 1 pass is available
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+     defined(HAS_RGB24TOYROW_LSX) || defined(HAS_RGB24TOYROW_RVV))
+#define HAS_RGB24TOYROW
+#endif
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
+  int y;
+#if defined(HAS_RGB24TOYROW)
+  void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+      RGB24ToYRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +#if defined(HAS_RGB24TOYROW) + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_NEON; + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } +#endif +#if defined(HAS_RGB24TOYROW_LSX) && defined(HAS_RGB24TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToUVRow = RGB24ToUVRow_Any_LSX; + RGB24ToYRow = RGB24ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_LSX; + RGB24ToUVRow = RGB24ToUVRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYROW_LASX) && defined(HAS_RGB24TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToUVRow = RGB24ToUVRow_Any_LASX; + RGB24ToYRow = RGB24ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYRow = RGB24ToYRow_LASX; + RGB24ToUVRow = RGB24ToUVRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYRow = RGB24ToYRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RGB24 to ARGB. +#else // HAS_RGB24TOYROW + +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#endif // HAS_RGB24TOYROW + + { +#if !defined(HAS_RGB24TOYROW) + // Allocate 2 rows of ARGB. 
+ const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYROW) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB24TOYROW) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB24TOYROW) + free_aligned_buffer_64(row); +#endif + } + return 0; +} +#undef HAS_RGB24TOYROW + +// Enabled if 1 pass is available +#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_RVV) +#define HAS_RGB24TOYJROW +#endif + +// Convert RGB24 to J420. +LIBYUV_API +int RGB24ToJ420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if defined(HAS_RGB24TOYJROW) + void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVJRow_C; + void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = + RGB24ToYJRow_C; +#else + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +#if defined(HAS_RGB24TOYJROW) + +// Neon version does direct RGB24 to YUV. 
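+// Platforms without a direct kernel take the #else branch further down:
+// each RGB24 row is widened into a scratch ARGB row first, then the ARGB
+// row kernels produce YJ/UVJ, trading an extra pass per row for code reuse.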
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; + RGB24ToYJRow = RGB24ToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_NEON; + RGB24ToUVJRow = RGB24ToUVJRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_MSA; + RGB24ToUVJRow = RGB24ToUVJRow_MSA; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RGB24 to ARGB. +#else // HAS_RGB24TOYJROW + +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#endif // HAS_RGB24TOYJROW + + { +#if !defined(HAS_RGB24TOYJROW) + // Allocate 2 rows of ARGB. 
+ const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYJROW) + RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); + RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB24TOYJROW) + RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB24TOYJROW) + free_aligned_buffer_64(row); +#endif + } + return 0; +} +#undef HAS_RGB24TOYJROW + +// Enabled if 1 pass is available +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ + defined(HAS_RAWTOYROW_LSX) || defined(HAS_RAWTOYROW_RVV)) +#define HAS_RAWTOYROW +#endif + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if defined(HAS_RAWTOYROW) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYROW) + +// Neon version does direct RAW to YUV. 
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_NEON; + RAWToUVRow = RAWToUVRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } +#endif +#if defined(HAS_RAWTOYROW_LSX) && defined(HAS_RAWTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToUVRow = RAWToUVRow_Any_LSX; + RAWToYRow = RAWToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_LSX; + RAWToUVRow = RAWToUVRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYROW_LASX) && defined(HAS_RAWTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToUVRow = RAWToUVRow_Any_LASX; + RAWToYRow = RAWToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYRow = RAWToYRow_LASX; + RAWToUVRow = RAWToUVRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYRow = RAWToYRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. +#else // HAS_RAWTOYROW + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#endif // HAS_RAWTOYROW + + { +#if !defined(HAS_RAWTOYROW) + // Allocate 2 rows of ARGB. 
+ const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYROW) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RAWTOYROW) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYROW) + free_aligned_buffer_64(row); +#endif + } + return 0; +} +#undef HAS_RAWTOYROW + +// Enabled if 1 pass is available +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) +#define HAS_RAWTOYJROW +#endif + +// Convert RAW to J420. +LIBYUV_API +int RAWToJ420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if defined(HAS_RAWTOYJROW) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYJROW) + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_NEON; + RAWToUVJRow = RAWToUVJRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVJRow = RAWToUVJRow_Any_MSA; + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; + RAWToUVJRow = RAWToUVJRow_MSA; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. 
+#else // HAS_RAWTOYJROW + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#endif // HAS_RAWTOYJROW + + { +#if !defined(HAS_RAWTOYJROW) + // Allocate 2 rows of ARGB. + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYJROW) + free_aligned_buffer_64(row); +#endif + } + return 0; +} +#undef HAS_RAWTOYJROW + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) + void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +// Neon version does direct RGB565 to YUV. 
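+// Note the nested alignment checks below: the non-Any NEON Y kernel is only
+// selected when width is a multiple of 8, and the NEON UV kernel when width
+// is a multiple of 16; other widths fall back to the _Any_ variants.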
+#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +// MSA version does direct RGB565 to YUV. +#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \ + defined(HAS_RGB565TOYROW_LASX)) +#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } +#endif +#if defined(HAS_RGB565TOYROW_LSX) && defined(HAS_RGB565TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB565ToUVRow = RGB565ToUVRow_Any_LSX; + RGB565ToYRow = RGB565ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_LSX; + RGB565ToUVRow = RGB565ToUVRow_LSX; + } + } +#endif +#if defined(HAS_RGB565TOYROW_LASX) && defined(HAS_RGB565TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB565ToUVRow = RGB565ToUVRow_Any_LASX; + RGB565ToYRow = RGB565ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB565ToYRow = RGB565ToYRow_LASX; + RGB565ToUVRow = RGB565ToUVRow_LASX; + } + } +#endif +// Other platforms do intermediate conversion from RGB565 to ARGB. +#else +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#endif + { +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) + // Allocate 2 rows of ARGB. 
+ const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) + void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +// Neon version does direct ARGB1555 to YUV. +#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +// MSA version does direct ARGB1555 to YUV. 
+#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \ + defined(HAS_ARGB1555TOYROW_LASX)) +#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGB1555TOYROW_LSX) && defined(HAS_ARGB1555TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LSX; + ARGB1555ToYRow = ARGB1555ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_LSX; + ARGB1555ToUVRow = ARGB1555ToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGB1555TOYROW_LASX) && defined(HAS_ARGB1555TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LASX; + ARGB1555ToYRow = ARGB1555ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB1555ToYRow = ARGB1555ToYRow_LASX; + ARGB1555ToUVRow = ARGB1555ToUVRow_LASX; + } + } +#endif +// Other platforms do intermediate conversion from ARGB1555 to ARGB. +#else +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#endif + { +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) + // Allocate 2 rows of ARGB. 
+ const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + row_size, + width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB4444 to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +// Neon version does direct ARGB4444 to YUV. +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from ARGB4444 to ARGB. 
+#else +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif +#endif + + { +#if !(defined(HAS_ARGB4444TOYROW_NEON)) + // Allocate 2 rows of ARGB. 
+ const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + row_size, + width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_ARGB4444TOYROW_NEON)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert RGB24 to J400. +LIBYUV_API +int RGB24ToJ400(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = + RGB24ToYJRow_C; + if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + // Coalesce rows. + if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_yj = 0; + } +#if defined(HAS_RGB24TOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_AVX2; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToYJRow = RGB24ToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_MSA; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + RGB24ToYJRow(src_rgb24, dst_yj, width); + src_rgb24 += src_stride_rgb24; + dst_yj += dst_stride_yj; + } + return 0; +} + +// Convert RAW to J400. 
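+// In libyuv, RAW is 24 bpp RGB with the R byte first in memory (the
+// byte-reversed counterpart of RGB24), and J400 is a single full-range luma
+// plane. An illustrative call, with example buffer names:
+//   RAWToJ400(raw, width * 3, gray, width, width, height);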
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_yj,
+              int dst_stride_yj,
+              int width,
+              int height) {
+  int y;
+  void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+      RAWToYJRow_C;
+  if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // Coalesce rows.
+  if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_yj = 0;
+  }
+
+#if defined(HAS_RAWTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToYJRow = RAWToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYJRow = RAWToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RAWToYJRow = RAWToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToYJRow = RAWToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToYJRow = RAWToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYJRow = RAWToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToYJRow = RAWToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYJRow = RAWToYJRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RAWToYJRow = RAWToYJRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYJRow = RAWToYJRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RAWToYJRow = RAWToYJRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToYJRow = RAWToYJRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RAWToYJRow = RAWToYJRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RAWToYJRow(src_raw, dst_yj, width);
+    src_raw += src_stride_raw;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height) {
+  return Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                                src_stride_v, src_pixel_stride_uv, dst_y,
+                                dst_stride_y, dst_u, dst_stride_u, dst_v,
+                                dst_stride_v, width, height, kRotate0);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
new file mode 100644
index 00000000..cc6560de
--- /dev/null
+++ b/source/convert_argb.cc
@@ -0,0 +1,8277 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include <assert.h>
+
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h"  // For CopyPlane and ARGBShuffle.
+#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/scale_row.h" // For ScaleRowUp2_Linear and ScaleRowUp2_Bilinear +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, + height); + return 0; +} + +// Convert I420 to ARGB with matrix. +LIBYUV_API +int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB. 
+LIBYUV_API +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J420 to ABGR. +LIBYUV_API +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H420 to ARGB. +LIBYUV_API +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H420 to ABGR. +LIBYUV_API +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U420 to ARGB. +LIBYUV_API +int U420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U420 to ABGR. +LIBYUV_API +int U420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert I422 to ARGB with matrix. 
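+// I422 chroma is half width but full height, so unlike the I420 version above
+// the row loop advances src_u and src_v on every row instead of every other
+// row.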
+LIBYUV_API +int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I422 to ABGR. 
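+// As in the other ABGR wrappers here, swapping the U and V planes while using
+// the mirrored Yvu matrix makes the ARGB row kernels produce ABGR output, so
+// no separate ABGR conversion path is needed.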
+LIBYUV_API +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J422 to ABGR. +LIBYUV_API +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H422 to ARGB. +LIBYUV_API +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H422 to ABGR. +LIBYUV_API +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U422 to ARGB. +LIBYUV_API +int U422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U422 to ABGR. +LIBYUV_API +int U422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert I444 to ARGB with matrix. 
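+// I444 chroma is full resolution, so all three source planes advance one row
+// per iteration, and rows can be coalesced whenever every source stride
+// equals width.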
+LIBYUV_API +int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u == width && src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I444ToARGBRow = I444ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J444 to ARGB. 
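+// Naming convention for these wrappers: I = BT.601 limited range, J = JPEG
+// (BT.601 full range), H = BT.709, U = BT.2020, matching the kYuv*Constants
+// matrix each wrapper selects.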
+LIBYUV_API +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J444 to ABGR. +LIBYUV_API +int J444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H444 to ARGB. +LIBYUV_API +int H444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H444 to ABGR. +LIBYUV_API +int H444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U444 to ARGB. +LIBYUV_API +int U444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U444 to ABGR. +LIBYUV_API +int U444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert I444 to RGB24 with matrix. +LIBYUV_API +int I444ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } + // Coalesce rows. 
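+  // When every plane is stored contiguously (stride equals row width), the
+  // whole image can be treated as a single long row, amortizing per-row call
+  // overhead.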
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to RGB24. +LIBYUV_API +int I444ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I444 to RAW. +LIBYUV_API +int I444ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert 10 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert U010 to AR30. +LIBYUV_API +int U010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuv2020Constants, width, height); +} + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert U010 to AB30. +LIBYUV_API +int U010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYuv2020Constants, width, height); +} + +// Convert 12 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to +// multiply 12 bit yuv into high bits to allow any number of bits. 
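+// The 10- and 12-bit planes are uint16_t arrays with samples stored in the
+// low bits of each element; AR30 output packs three 10-bit channels and a
+// 2-bit alpha into 32 bits (2:10:10:10).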
+LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToAR30Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I212TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToAR30Row = I212ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToAR30Row = I212ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I212TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToAR30Row = I212ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToAR30Row = I212ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert 10 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I210 to AR30. +LIBYUV_API +int I210ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H210 to AR30. 
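+// H210 is 10 bit 4:2:2 YUV; the H prefix selects the BT.709 (HD) matrix.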
+LIBYUV_API +int H210ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert U210 to AR30. +LIBYUV_API +int U210ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuv2020Constants, width, height); +} + +// Convert I210 to AB30. +LIBYUV_API +int I210ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H210 to AB30. +LIBYUV_API +int H210ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert U210 to AB30. +LIBYUV_API +int U210ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYuv2020Constants, width, height); +} + +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert 10 bit YUV to ARGB with matrix. 
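+// I010 is 10 bit 4:2:0: the chroma pointers advance only on odd rows
+// (the y & 1 test below), matching the half height U and V planes.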
+LIBYUV_API +int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U010 to ARGB. 
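+// The U prefix selects the BT.2020 color matrix.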
+LIBYUV_API +int U010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U010 to ABGR. +LIBYUV_API +int U010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert 12 bit YUV to ARGB with matrix. +LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I212TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToARGBRow = I212ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToARGBRow = I212ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I212TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToARGBRow = I212ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToARGBRow = I212ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert 10 bit 422 YUV to ARGB with matrix. +LIBYUV_API +int I210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I210 to ARGB. +LIBYUV_API +int I210ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I210 to ABGR. +LIBYUV_API +int I210ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H210 to ARGB. +LIBYUV_API +int H210ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H210 to ABGR. +LIBYUV_API +int H210ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U210 to ARGB. +LIBYUV_API +int U210ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U210 to ABGR. 
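+// As in the other ABGR wrappers, U and V are swapped and the YVU form of the
+// BT.2020 matrix is used so the ARGB row functions can emit ABGR directly.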
+LIBYUV_API +int U210ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert I420 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I422AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I422 with Alpha to preattenuated ARGB with matrix. 
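+// Identical in structure to the I420 variant above, except that 4:2:2 chroma
+// is full height, so the U and V pointers advance on every row.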
+LIBYUV_API +int I422AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I422AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + 
if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I444AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I444AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += 
src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I420 with Alpha to ARGB. +LIBYUV_API +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); +} + +// Convert I420 with Alpha to ABGR. +LIBYUV_API +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + +// Convert I422 with Alpha to ARGB. +LIBYUV_API +int I422AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); +} + +// Convert I422 with Alpha to ABGR. +LIBYUV_API +int I422AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I422AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + +// Convert I444 with Alpha to ARGB. +LIBYUV_API +int I444AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); +} + +// Convert I444 with Alpha to ABGR. +LIBYUV_API +int I444AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I444AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + +// Convert I010 with Alpha to preattenuated ARGB with matrix. 
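+// A hedged usage sketch (buffer names assumed; 16 bit strides are counted in
+// uint16_t units; attenuate = 1 premultiplies RGB by alpha):
+//   I010AlphaToARGBMatrix(y, w, u, (w + 1) / 2, v, (w + 1) / 2, a, w,
+//                         dst, w * 4, &kYuvI601Constants, w, h, 1);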
+LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I210 with Alpha to preattenuated ARGB with matrix. 
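+// 4:2:2 variant of the above: chroma advances on every row rather than every
+// second row.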
+LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I410 with Alpha to preattenuated ARGB with matrix. 
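+// 4:4:4 variant: the Y, U, V and A planes are all full resolution.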
+LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I400 to ARGB with matrix. +LIBYUV_API +int I400ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I400ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
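+ // (When both strides are exactly one packed row, the image is processed as
+ // a single row of width * height pixels; e.g. a 64x4 image becomes one
+ // 256 pixel row, so the row function is called once.)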
+ if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I400ToARGBRow = I400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I400TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I400ToARGBRow = I400ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I400ToARGBRow = I400ToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert J400 to ARGB. +LIBYUV_API +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = + J400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. 
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_J400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J400ToARGBRow = J400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ J400ToARGBRow = J400ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ J400ToARGBRow = J400ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
+
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+
+// Shuffle table for converting RGBA to ARGB.
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
+
+// Shuffle table for converting AR64 to AB64.
+static const uvec8 kShuffleMaskAR64ToAB64 = {
+ 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
+}
+
+// Convert RGBA to ARGB.
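+// Like the wrappers above, this is a pure byte shuffle driven by
+// kShuffleMaskRGBAToARGB.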
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height);
+}
+
+// Convert AR64 to AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64,
+ (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ if (!src_raw || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToARGBRow = RAWToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToARGBRow = RAWToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToARGBRow = RAWToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to RGBA. +LIBYUV_API +int RAWToRGBA(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + int y; + void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) = + RAWToRGBARow_C; + if (!src_raw || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_rgba = 0; + } +#if defined(HAS_RAWTORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToRGBARow = RAWToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToRGBARow = RAWToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToRGBARow = RAWToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToRGBARow = RAWToRGBARow_NEON; + } + } +#endif +#if defined(HAS_RAWTORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGBARow = RAWToRGBARow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + RAWToRGBARow(src_raw, dst_rgba, width); + src_raw += src_stride_raw; + dst_rgba += dst_stride_rgba; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. 
+ if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB565ToARGBRow = RGB565ToARGBRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. 
+ if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. 
+ if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_argb = 0; + } + for (y = 0; y < height; ++y) { + AR30ToARGBRow_C(src_ar30, dst_argb, width); + src_ar30 += src_stride_ar30; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_abgr = 0; + } + for (y = 0; y < height; ++y) { + AR30ToABGRRow_C(src_ar30, dst_abgr, width); + src_ar30 += src_stride_ar30; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert AR30 to AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + int y; + if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_ab30 = 0; + } + for (y = 0; y < height; ++y) { + AR30ToAB30Row_C(src_ar30, dst_ab30, width); + src_ar30 += src_stride_ar30; + dst_ab30 += dst_stride_ab30; + } + return 0; +} + +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AR64ToARGBRow_C; + if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. + if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_argb = 0; + } +#if defined(HAS_AR64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AR64ToARGBRow = AR64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ToARGBRow = AR64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AR64ToARGBRow = AR64ToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + AR64ToARGBRow(src_ar64, dst_argb, width); + src_ar64 += src_stride_ar64; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AB64ToARGBRow_C; + if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; + src_stride_ab64 = -src_stride_ab64; + } + // Coalesce rows. 
+ if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ab64 = dst_stride_argb = 0; + } +#if defined(HAS_AB64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AB64ToARGBRow = AB64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AB64ToARGBRow = AB64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AB64ToARGBRow = AB64ToARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + AB64ToARGBRow(src_ab64, dst_argb, width); + src_ab64 += src_stride_ab64; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB with matrix. +LIBYUV_API +int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + NV12ToARGBRow = NV12ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + NV12ToARGBRow = NV12ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB with matrix. 
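+// (NV21ToARGBMatrix itself follows the sketch below.)
+//
+// A minimal caller sketch for NV12ToARGBMatrix above. The 64x48 frame and
+// the tightly packed strides are assumptions for illustration, not library
+// requirements. The interleaved UV plane is half height, which is why the
+// converter only advances it on odd rows.
+static int ExampleNV12ToARGB(const uint8_t* nv12, uint8_t* argb) {
+  const int kWidth = 64;
+  const int kHeight = 48;
+  const uint8_t* src_y = nv12;                      // kWidth * kHeight bytes.
+  const uint8_t* src_uv = nv12 + kWidth * kHeight;  // kWidth * kHeight / 2.
+  return NV12ToARGBMatrix(src_y, kWidth,     // Y stride equals width.
+                          src_uv, kWidth,    // UV row holds kWidth bytes.
+                          argb, kWidth * 4,  // 4 bytes per ARGB pixel.
+                          &kYuvI601Constants, kWidth, kHeight);
+}
+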
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_vu,
+                     int src_stride_vu,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*NV21ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+  assert(yuvconstants);
+  if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToARGBRow = NV21ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToARGBRow = NV21ToARGBRow_LASX;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV12 to ABGR.
+// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix.
+// To swap the UV use NV12 instead of NV21.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to ABGR.
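+// (NV21ToABGR itself follows the sketch below.)
+//
+// The swap trick above can be sanity checked on one 2x2 block: converting
+// the same NV12 data to ARGB and to ABGR should exchange the B and R byte
+// positions and leave alpha in byte 3. Little-endian byte order is assumed
+// (ARGB stores B,G,R,A per pixel; ABGR stores R,G,B,A). Illustrative only.
+static void ExampleNV12SwapCheck(void) {
+  const uint8_t src_y[4] = {90, 90, 90, 90};  // 2x2 luma.
+  const uint8_t src_uv[2] = {240, 64};        // One U,V pair for the block.
+  uint8_t argb[16];
+  uint8_t abgr[16];
+  NV12ToARGB(src_y, 2, src_uv, 2, argb, 8, 2, 2);
+  NV12ToABGR(src_y, 2, src_uv, 2, abgr, 8, 2, 2);
+  // Expect argb[0] == abgr[2] (blue) and argb[2] == abgr[0] (red).
+}
+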
+LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// TODO(fbarchard): Consider SSSE3 2 step conversion. +// Convert NV12 to RGB24 with matrix. +LIBYUV_API +int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV12TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB24Row = NV12ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV12ToRGB24Row = NV12ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB24 with matrix. +LIBYUV_API +int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV21TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB24Row = NV21ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToRGB24Row = NV21ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV12 to RAW. +LIBYUV_API +int NV12ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw, + dst_stride_raw, &kYvuI601Constants, width, height); +} + +// Convert NV21 to RAW. +LIBYUV_API +int NV21ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw, + dst_stride_raw, &kYvuI601Constants, width, height); +} + +// Convert NV21 to YUV24 +int NV21ToYUV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_yuv24, + int dst_stride_yuv24, + int width, + int height) { + int y; + void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, + uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; + if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24; + dst_stride_yuv24 = -dst_stride_yuv24; + } +#if defined(HAS_NV21TOYUV24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + NV21ToYUV24Row = NV21ToYUV24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TOYUV24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToYUV24Row = NV21ToYUV24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TOYUV24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToYUV24Row = NV21ToYUV24Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width); + dst_yuv24 += dst_stride_yuv24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToARGBRow = YUY2ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_LSX; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. 
+ if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToARGBRow = UYVYToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToARGBRow = UYVYToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_LSX; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} +static void WeavePixels(const uint8_t* src_u, + const uint8_t* src_v, + int src_pixel_stride_uv, + uint8_t* dst_uv, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_uv[0] = *src_u; + dst_uv[1] = *src_v; + dst_uv += 2; + src_u += src_pixel_stride_uv; + src_v += src_pixel_stride_uv; + } +} + +// Convert Android420 to ARGB with matrix. +LIBYUV_API +int Android420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + uint8_t* dst_uv; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + // I420 + if (src_pixel_stride_uv == 1) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + // NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, + dst_stride_argb, yuvconstants, width, height); + // NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, + dst_stride_argb, yuvconstants, width, height); + } + + // General case fallback creates NV12 + align_buffer_64(plane_uv, halfwidth * 2 * halfheight); + dst_uv = plane_uv; + for (y = 0; y < halfheight; ++y) { + WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += halfwidth * 2; + } + NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, + dst_stride_argb, yuvconstants, width, height); + free_aligned_buffer_64(plane_uv); + return 0; +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_pixel_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height); +} + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, src_pixel_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, + height); +} + +// Convert I422 to RGBA with matrix. +LIBYUV_API +int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif +#if defined(HAS_I422TORGBAROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGBARow = I422ToRGBARow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGBARow = I422ToRGBARow_LASX; + } + } +#endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert NV12 to RGB565 with matrix. +LIBYUV_API +int NV12ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB565Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_LSX;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_LASX;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+                            width, height);
+}
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  assert(yuvconstants);
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif +#if defined(HAS_I422TORGBAROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGBARow = I422ToRGBARow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGBARow = I422ToRGBARow_LASX; + } + } +#endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to RGB24 with matrix. +LIBYUV_API +int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J420 to RGB24. +LIBYUV_API +int J420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvJPEGConstants, width, height); +} + +// Convert J420 to RAW. +LIBYUV_API +int J420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H420 to RGB24. 
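+// (H420ToRGB24 itself follows the sketch below.)
+//
+// A minimal sketch of the wrapper pattern used above: RAW is RGB24 with the
+// byte order reversed, so I420ToRAW is just I420ToRGB24Matrix with U and V
+// swapped and the mirrored (Yvu) matrix. The 32x32 frame and packed strides
+// are assumptions for illustration.
+static int ExampleI420ToRAW(const uint8_t* src_y, const uint8_t* src_u,
+                            const uint8_t* src_v, uint8_t* dst_raw) {
+  const int kWidth = 32;
+  const int kHeight = 32;
+  const int kHalfWidth = (kWidth + 1) / 2;
+  // Equivalent to I420ToRAW(src_y, kWidth, src_u, kHalfWidth, ...).
+  return I420ToRGB24Matrix(src_y, kWidth, src_v, kHalfWidth,  // V as "U".
+                           src_u, kHalfWidth,                 // U as "V".
+                           dst_raw, kWidth * 3,  // 3 bytes per pixel.
+                           &kYvuI601Constants, kWidth, kHeight);
+}
+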
+LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I422 to RGB24 with matrix. +LIBYUV_API +int I422ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGB24. 
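+// (I422ToRGB24 itself follows the sketch below.)
+//
+// Note the loop above advances src_u and src_v on every row: I422 chroma
+// planes are half width but full height, unlike the I420 variant earlier.
+// A caller sketch with assumed tightly packed planes:
+static int ExampleI422ToRGB24(const uint8_t* i422, uint8_t* dst_rgb24) {
+  const int kWidth = 64;
+  const int kHeight = 48;
+  const int kHalfWidth = (kWidth + 1) / 2;
+  const uint8_t* src_y = i422;
+  const uint8_t* src_u = src_y + kWidth * kHeight;     // kHalfWidth * kHeight.
+  const uint8_t* src_v = src_u + kHalfWidth * kHeight;
+  return I422ToRGB24Matrix(src_y, kWidth, src_u, kHalfWidth, src_v, kHalfWidth,
+                           dst_rgb24, kWidth * 3, &kYuvI601Constants, kWidth,
+                           kHeight);
+}
+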
+LIBYUV_API +int I422ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I422 to RAW. +LIBYUV_API +int I422ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_LSX; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, + width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB4444. 
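+// (I420ToARGB4444 itself follows the sketch below.)
+//
+// For reference, a little-endian ARGB1555 pixel packs bit 15 = A, bits
+// 14-10 = R, bits 9-5 = G and bits 4-0 = B. A sketch of unpacking one pixel
+// back to 8-bit channels with the common (v << 3) | (v >> 2) expansion; the
+// helper is illustrative, not a libyuv API.
+static void ExampleUnpackARGB1555(uint16_t pixel, uint8_t bgra[4]) {
+  uint8_t b = pixel & 0x1f;
+  uint8_t g = (pixel >> 5) & 0x1f;
+  uint8_t r = (pixel >> 10) & 0x1f;
+  bgra[0] = (uint8_t)((b << 3) | (b >> 2));  // Replicate top bits downward.
+  bgra[1] = (uint8_t)((g << 3) | (g >> 2));
+  bgra[2] = (uint8_t)((r << 3) | (r >> 2));
+  bgra[3] = (pixel & 0x8000) ? 255 : 0;  // 1-bit alpha.
+}
+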
+LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; + dst_stride_argb4444 = -dst_stride_argb4444; + } +#if defined(HAS_I422TOARGB4444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_LSX; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, + width); + dst_argb4444 += dst_stride_argb4444; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565 with specified color matrix. +LIBYUV_API +int I420ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB565Row = I422ToRGB565Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565. +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvI601Constants, width, height); +} + +// Convert J420 to RGB565. +LIBYUV_API +int J420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvJPEGConstants, width, height); +} + +// Convert H420 to RGB565. +LIBYUV_API +int H420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvH709Constants, width, height); +} + +// Convert I422 to RGB565 with specified color matrix. 
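+// (I422ToRGB565Matrix itself follows the sketch below.)
+//
+// The three wrappers above differ only in the matrix constant: BT.601
+// (I420), full-range JPEG (J420) or BT.709 (H420). A sketch of picking the
+// matrix at runtime; the ExampleColorSpace enum is a local assumption, not
+// a libyuv type.
+enum ExampleColorSpace { kExampleBT601, kExampleJPEG, kExampleBT709 };
+static const struct YuvConstants* ExampleMatrixFor(enum ExampleColorSpace cs) {
+  switch (cs) {
+    case kExampleJPEG:
+      return &kYuvJPEGConstants;
+    case kExampleBT709:
+      return &kYuvH709Constants;
+    case kExampleBT601:
+    default:
+      return &kYuvI601Constants;
+  }
+}
+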
+LIBYUV_API
+int I422ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                          const uint8_t* v_buf, uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  assert(yuvconstants);
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_LSX;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToRGB565Row = I422ToRGB565Row_LASX;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  return I422ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_rgb565, dst_stride_rgb565,
+                            &kYuvI601Constants, width, height);
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
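+// (I420ToRGB565Dither itself follows the sketch below.)
+//
+// A sketch of how the 4x4 table above is consumed, mirroring the C reference
+// row: scanline y uses table row (y & 3), pixel x uses entry (x & 3), and
+// the 0..7 offset is added to each 8-bit channel with clamping before the
+// 8-to-5/6/5 bit truncation. Illustrative only.
+static uint16_t ExampleDither565(uint8_t b, uint8_t g, uint8_t r, int x,
+                                 int y) {
+  int d = kDither565_4x4[((y & 3) << 2) + (x & 3)];
+  int b5 = (b + d > 255 ? 255 : b + d) >> 3;
+  int g6 = (g + d > 255 ? 255 : g + d) >> 2;
+  int r5 = (r + d > 255 ? 255 : r + d) >> 3;
+  return (uint16_t)(b5 | (g6 << 5) | (r5 << 11));
+}
+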
+LIBYUV_API +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + 
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
+    }
+  }
+#endif
+  {
+    // Allocate a row of argb.
+    align_buffer_64(row_argb, width * 4);
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+                            *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+                            width);
+      dst_rgb565 += dst_stride_rgb565;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row_argb);
+  }
+  return 0;
+}
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToAR30Row_C;
+
+  assert(yuvconstants);
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToAR30Row = I422ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToAR30Row = I422ToAR30Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  // BT.709 matrix with unswapped U/V; the kYvu* constants belong to the
+  // U/V-swapped AB30 wrappers below and would swap red and blue here.
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert I420 to AB30.
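+// There is no dedicated AB30 row path: AB30 is AR30 with the red and blue
+// channels exchanged, so the wrappers below reuse I420ToAR30Matrix with the
+// U and V plane arguments swapped plus the matching kYvu* constants, whose
+// swapped red/blue coefficients complete the channel exchange. For example,
+// the BT.601 case expands to:
+//
+//   I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v,  // V first
+//                    src_u, src_stride_u, dst_ab30, dst_stride_ab30,
+//                    &kYvuI601Constants, width, height);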
+LIBYUV_API +int I420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H420 to AB30. +LIBYUV_API +int H420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +static int I420ToARGBMatrixBilinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToARGBRow = I444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToARGBRow = I444ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = 
ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); + I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + I444ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I422ToARGBMatrixLinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
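+  // For example, height = -4 with dst_stride_argb = 1024 becomes height = 4,
+  // dst_argb advanced 3 * 1024 bytes to the last row, and a stride of -1024,
+  // so the destination is written bottom-up while the source is still read
+  // top-down, flipping the image vertically.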
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToARGBRow = I444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToARGBRow = I444ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); + I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
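+  // Plan for this helper: upsample each pair of I420 chroma rows to full
+  // 4:4:4 resolution, then let the I444 row kernel do the color conversion.
+  // Scale2RowUp_Bilinear emits two full-width rows per call, at dst_ptr and
+  // dst_ptr + dst_stride, interpolated between two source rows; per output
+  // sample the taps are nominally (9 * a + 3 * b + 3 * c + d + 8) / 16.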
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + I444ToRGB24Row(src_y, temp_u_2, temp_v_2, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I010ToAR30MatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct 
YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); + uint16_t* temp_u_1 = (uint16_t*)(row); + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; + + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); + I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); + I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + I410ToAR30Row(src_y, temp_u_2, temp_v_2, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); + I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); + } + + free_aligned_buffer_64(row); + + return 0; +} + +static int I210ToAR30MatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const 
uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); + uint16_t* temp_u = (uint16_t*)(row); + uint16_t* temp_v = (uint16_t*)(row) + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); + I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + free_aligned_buffer_64(row); + return 0; +} + +static int I010ToARGBMatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
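+  // Bit-depth note: I010 samples are 10 significant bits (0..1023) carried
+  // in uint16_t. The _12_ upsamplers selected below operate on uint16_t and
+  // only require values to fit in 12 bits, so 10-bit input is safe: a
+  // worst-case bilinear accumulator of 16 * 1023 = 16368 still fits in 16
+  // bits.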
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); + uint16_t* temp_u_1 = (uint16_t*)(row); + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; + + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); + I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); + I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + I410ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); + I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I210ToARGBMatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
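+  // Filter note: I210 chroma is halved horizontally but full height, so each
+  // row needs only a 1-D doubling before the 4:4:4 kernel. Per pair of
+  // outputs the linear filter is, in the spirit of the C kernel:
+  //
+  //   dst[2 * i + 0] = (3 * src[i] + src[i + 1] + 2) >> 2;
+  //   dst[2 * i + 1] = (src[i] + 3 * src[i + 1] + 2) >> 2;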
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); + uint16_t* temp_u = (uint16_t*)(row); + uint16_t* temp_v = (uint16_t*)(row) + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); + I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I420AlphaToARGBMatrixBilinear( + const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I444AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
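+  // Alpha note: the alpha plane is full resolution (one src_a row per luma
+  // row), so only U and V go through the upsampler. When attenuate is set,
+  // each finished row is premultiplied in place, conceptually
+  // rgb = rgb * a / 255; e.g. (r, g, b, a) = (255, 128, 0, 128) becomes
+  // (128, 64, 0, 128).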
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444AlphaToARGBRow(src_y, temp_u_1, 
temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); + I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + I444AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + src_a += src_stride_a; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I444AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
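+  // Structure note: same shape as I422ToARGBMatrixLinear above, plus alpha.
+  // Two scratch rows hold the horizontally doubled U and V for the current
+  // row, the 4:4:4 alpha kernel consumes them together with the untouched
+  // src_a row, and ARGBAttenuateRow optionally premultiplies the result in
+  // place.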
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); + I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + 
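+// Disabled usage sketch for the helper above. The 64x48 frame and buffer
+// names are assumptions for illustration, not part of this patch; I422
+// chroma is half width but full height, hence the width / 2 chroma strides.
+#if 0
+static int ExampleI422AlphaToARGBLinear(void) {
+  enum { kW = 64, kH = 48 };
+  static uint8_t y[kW * kH];         // full-resolution luma
+  static uint8_t u[(kW / 2) * kH];   // half-width chroma
+  static uint8_t v[(kW / 2) * kH];
+  static uint8_t a[kW * kH];         // full-resolution alpha
+  static uint8_t argb[kW * 4 * kH];  // 4 bytes per output pixel
+  return I422AlphaToARGBMatrixLinear(y, kW, u, kW / 2, v, kW / 2, a, kW, argb,
+                                     kW * 4, &kYuvI601Constants, kW, kH,
+                                     /*attenuate=*/1);
+}
+#endif
+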
+static int I010AlphaToARGBMatrixBilinear( + const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); + uint16_t* temp_u_1 = (uint16_t*)(row); + 
uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; + + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); + I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); + I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + I410AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); + I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
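+  // Naming note: the portable fallback above is the generic
+  // ScaleRowUp2_Linear_16_Any_C, while the SIMD overrides chosen below are
+  // the _12_ variants; both operate on uint16_t, and the 12-bit headroom of
+  // the latter comfortably covers this 10-bit input.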
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); + uint16_t* temp_u = (uint16_t*)(row); + uint16_t* temp_v = (uint16_t*)(row) + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); + I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + free_aligned_buffer_64(row); + return 0; +} + +static int P010ToARGBMatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; + void (*Scale2RowUp_Bilinear_16)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
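+  // Layout note: P010 chroma is a single interleaved UV plane (NV12-style)
+  // with 10-bit samples stored in the upper bits of each uint16_t. The
+  // upsampler here is therefore the UV variant, every scratch row holds
+  // width U/V pairs, and row_size below is computed from 2 * width.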
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P410ToARGBRow = P410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P410ToARGBRow = P410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P410ToARGBRow = P410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P410ToARGBRow = P410ToARGBRow_AVX2; + } + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); + uint16_t* temp_uv_1 = (uint16_t*)(row); + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; + + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); + P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); + P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + P410ToARGBRow(src_y, temp_uv_2, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + + if (!(height & 1)) { + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); + P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int P210ToARGBMatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
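+  // Format note: P210 keeps the interleaved UV plane at full height, so only
+  // the horizontal doubling is needed: one scratch row of width UV pairs per
+  // output row, fed straight to P410ToARGBRow. E.g. at width 640 the scratch
+  // row holds 2 * 640 = 1280 uint16_t values (already a multiple of 32).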
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P410ToARGBRow = P410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P410ToARGBRow = P410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P410ToARGBRow = P410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P410ToARGBRow = P410ToARGBRow_AVX2; + } + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; + } +#endif + + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); + uint16_t* temp_uv = (uint16_t*)(row); + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_uv, temp_uv, width); + P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int P010ToAR30MatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; + void (*Scale2RowUp_Bilinear_16)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
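+  // Output note: AR30 is a 32-bit 2:10:10:10 little-endian pixel, 10 bits of
+  // blue in the low bits, then green, then red, with 2 alpha bits on top. A
+  // sketch of the packing for one opaque pixel (r = 1023, g = 512, b = 4):
+  //
+  //   uint32_t ar30 = 4u | (512u << 10) | (1023u << 20) | (3u << 30);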
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P410ToAR30Row = P410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P410ToAR30Row = P410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P410ToAR30Row = P410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P410ToAR30Row = P410ToAR30Row_AVX2; + } + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); + uint16_t* temp_uv_1 = (uint16_t*)(row); + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; + + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); + P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); + P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + P410ToAR30Row(src_y, temp_uv_2, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + + if (!(height & 1)) { + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); + P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int P210ToAR30MatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
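+  // Precision note: this path keeps 10-bit channel depth end to end; the
+  // upsampled UV stays in uint16_t and P410ToAR30Row packs straight into
+  // AR30's 10-bit channels, with no 8-bit ARGB intermediate like the RGB565
+  // dither path earlier in the file.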
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P410ToAR30Row = P410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P410ToAR30Row = P410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P410ToAR30Row = P410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P410ToAR30Row = P410ToAR30Row_AVX2; + } + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; + } +#endif + + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); + uint16_t* temp_uv = (uint16_t*)(row); + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_uv, temp_uv, width); + P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I422ToRGB24MatrixLinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
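+  // (The *Linear helpers, this one included, upsample chroma horizontally
+  // only: a 4:2:2 source already has full vertical chroma resolution, so
+  // each row just needs a 2x widthwise stretch. The C reference kernel,
+  // ScaleRowUp2_Linear_C, behaves roughly as:
+  //   dst[2 * x + 0] = (3 * src[x] + src[x + 1] + 2) >> 2;
+  //   dst[2 * x + 1] = (src[x] + 3 * src[x + 1] + 2) >> 2;)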
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); + I444ToRGB24Row(src_y, temp_u, temp_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +LIBYUV_API +int I422ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422ToRGB24MatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I420ToARGBMatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + return I420ToARGBMatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: + // Actually we can do this, but probably there's no usage. 
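+      // (A linear-only upsample of 4:2:0 chroma would still need a vertical
+      // policy, so this case deliberately falls through to the error return
+      // below. Hedged usage sketch for the filtered API, with hypothetical
+      // buffers and sizes -- not code from this commit:
+      //   // 640x480 I422 -> RGB24, linear chroma filter, BT.601 matrix:
+      //   I422ToRGB24MatrixFilter(y, 640, u, 320, v, 320, rgb, 640 * 3,
+      //                           &kYuvI601Constants, 640, 480,
+      //                           kFilterLinear);)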
+ return -1; + } + + return -1; +} + +LIBYUV_API +int I422ToARGBMatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422ToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_argb, dst_stride_argb, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I420ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I420ToRGB24MatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I010ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I010ToAR30MatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_ar30, dst_stride_ar30, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I210ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I210ToAR30MatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_ar30, dst_stride_ar30, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I010ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I010ToARGBMatrix(src_y, 
src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I010ToARGBMatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_argb, dst_stride_argb, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I210ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I210ToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_argb, dst_stride_argb, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I420AlphaToARGBMatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + dst_argb, dst_stride_argb, yuvconstants, + width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I420AlphaToARGBMatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + } + + return -1; +} + +LIBYUV_API +int I422AlphaToARGBMatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + dst_argb, dst_stride_argb, yuvconstants, + width, height, attenuate); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422AlphaToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + } + + return -1; +} + +LIBYUV_API +int I010AlphaToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I010AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + 
dst_argb, dst_stride_argb, yuvconstants, + width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I010AlphaToARGBMatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + } + + return -1; +} + +LIBYUV_API +int I210AlphaToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I210AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + dst_argb, dst_stride_argb, yuvconstants, + width, height, attenuate); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I210AlphaToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + } + + return -1; +} + +// TODO(fb): Verify this function works correctly. P010 is like NV12 but 10 bit +// UV is biplanar. +LIBYUV_API +int P010ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_argb, dst_stride_argb, yuvconstants, width, + height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv, + src_stride_uv, dst_argb, dst_stride_argb, + yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int P210ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P210ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_argb, dst_stride_argb, yuvconstants, width, + height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return P210ToARGBMatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv, + dst_argb, dst_stride_argb, yuvconstants, + width, height); + } + + return -1; +} + +LIBYUV_API +int P010ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_ar30, dst_stride_ar30, yuvconstants, width, + height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv, + src_stride_uv, dst_ar30, dst_stride_ar30, + yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int 
P210ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P210ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_ar30, dst_stride_ar30, yuvconstants, width, + height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return P210ToAR30MatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv, + dst_ar30, dst_stride_ar30, yuvconstants, + width, height); + } + + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/convert_from.cc b/source/convert_from.cc new file mode 100644 index 00000000..4102d610 --- /dev/null +++ b/source/convert_from.cc @@ -0,0 +1,882 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. +// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane + +static int I420ToI4xx(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int dst_uv_width, + int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || + dst_uv_height <= 0) { + return -1; + } + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Convert 8 bit YUV to 10 bit. 
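+// The widening is a pure shift: the 1024 below scales 8-bit samples up to
+// 10 bit (x4) and 4096 scales to 12 bit (x16), assuming Convert8To16Plane
+// applies scale as value * scale / 256. Example: video white Y8 = 235
+// becomes Y10 = 940 and Y12 = 3760.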
+LIBYUV_API +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + halfheight); + return 0; +} + +// Convert 8 bit YUV to 12 bit. +LIBYUV_API +int I420ToI012(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width, + height); + // Convert UV planes. 
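+  // The UV planes are quarter size; the (width + 1) >> 1 rounding above
+  // keeps odd dimensions covered. Example arithmetic: width 7 gives
+  // halfwidth (7 + 1) >> 1 = 4, and the SUBSAMPLE macro mirrors this for
+  // negative (flipped) sizes: SUBSAMPLE(-7, 1, 1) = -((7 + 1) >> 1) = -4.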
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth, + halfheight); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); +} + +// 420 chroma to 444 chroma, 10/12 bit version +LIBYUV_API +int I010ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), + Abs(height), kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +// 422 chroma to 444 chroma, 10/12 bit version +LIBYUV_API +int I210ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I422ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, 
+ uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
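+  // I420ToYUY2 reuses I422ToYUY2Row, feeding each chroma row to two
+  // consecutive output rows (the y += 2 loop below). Size arithmetic for a
+  // hypothetical 640x480 frame: I420 in = 640 * 480 * 3 / 2 = 460800 bytes,
+  // YUY2 out = 640 * 480 * 2 = 614400 bytes (2 bytes per pixel).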
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_LASX; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. 
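+  // When every plane is fully packed, the image can be processed as one
+  // very wide row, saving per-row call overhead. Illustration with a
+  // hypothetical 64x64 frame: strides y = 64, u = v = 32, uyvy = 128 pass
+  // the test below, so width becomes 64 * 64 = 4096 and height becomes 1;
+  // the strides are zeroed because they are never advanced again.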
+ if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
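+  // Byte-order reminder for the two packed 4:2:2 outputs in this file,
+  // two bytes per pixel with each U/V pair shared by two pixels:
+  //   YUY2: Y0 U0 Y1 V0 | Y2 U1 Y3 V1 ...
+  //   UYVY: U0 Y0 V0 Y1 | U1 Y2 V1 Y3 ...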
+ if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + I422ToUYVYRow(src_y + src_stride_y, src_u, src_v, + dst_uyvy + dst_stride_uyvy, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy * 2; + } + if (height & 1) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + } + return 0; +} + +LIBYUV_API +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
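+  // NV12 is the full-resolution Y plane followed by one half-resolution
+  // plane of interleaved UV pairs; MergeUVPlane below performs the
+  // interleave. Layout for a hypothetical 4x4 frame: Y plane = 16 bytes,
+  // UV plane = 8 bytes (two rows of U0 V0 U1 V1).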
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, + halfwidth, halfheight); + return 0; +} + +LIBYUV_API +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, + width, height); +} + +// Convert I420 to specified format +LIBYUV_API +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int r = 0; + if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { + return -1; + } + switch (format) { + // Single plane formats + case FOURCC_YUY2: + r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_UYVY: + r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_RGBP: + r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_RGBO: + r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_R444: + r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_AR30: + r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, dst_sample, + dst_sample_stride ? 
dst_sample_stride : width, width, + height); + break; + case FOURCC_NV12: { + int dst_y_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_uv = dst_sample + dst_y_stride * height; + r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_uv, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + } + case FOURCC_NV21: { + int dst_y_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_vu = dst_sample + dst_y_stride * height; + r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_vu, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + } + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + int halfheight = (height + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * halfheight; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * halfheight; + } + r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * height; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * height; + } + r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + dst_sample_stride * height; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + dst_sample_stride * height; + } + r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, dst_sample_stride, dst_v, + dst_sample_stride, width, height); + break; + } + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc new file mode 100644 index 00000000..c3d037c4 --- /dev/null +++ b/source/convert_from_argb.cc @@ -0,0 +1,3284 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u == width && dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_LSX; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToUV444Row = ARGBToUV444Row_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += 
src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); 
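+    // (A src_stride of 0 in the ARGBToUVRow call above means "no second
+    // row": chroma is averaged across horizontal pixel pairs only, giving
+    // 4:2:2. The NV12/NV21 paths below pass the real stride so full 2x2
+    // blocks are averaged for 4:2:0 chroma.)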
+ src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +LIBYUV_API +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } 
+#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_LSX; + } + } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. +LIBYUV_API +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
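+  // ARGBToNV21 differs from ARGBToNV12 above only in pair order: the same
+  // planar U and V temp rows are built, but the merge is invoked as
+  // MergeUVRow_(row_v, row_u, ...) so V leads each interleaved pair.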
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = 
MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
+  {
+    // Allocate a row of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_vu += dst_stride_vu;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYRow = ABGRToYRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYRow = ABGRToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYRow = ABGRToYRow_RVV;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    MergeUVRow_ = MergeUVRow_Any_LSX;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
+  {
+    // Allocate a row of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+    for (y = 0; y < height - 1; y += 2) {
+      ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+      ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+      src_abgr += src_stride_abgr * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYRow = ABGRToYRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYRow = ABGRToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYRow = ABGRToYRow_RVV;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    MergeUVRow_ = MergeUVRow_Any_LSX;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
+  {
+    // Allocate a row of uv.
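+    // Note: (halfwidth + 31) & ~31 rounds halfwidth up to a multiple of 32
+    // (e.g. 100 -> 128), so row_v starts 32-byte aligned within the single
+    // allocation holding both the U and V rows.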
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } 
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYRow = ARGBToYRow_Any_LASX;
+    ARGBToUVRow = ARGBToUVRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_LASX;
+      ARGBToUVRow = ARGBToUVRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYRow = ARGBToYRow_RVV;
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_LSX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_LASX;
+    }
+  }
+#endif
+
+  {
+    // Allocate a row of yuv.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
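+  // When both strides equal the exact row width in bytes, the rows are
+  // contiguous and the image can be processed as one long row; e.g. a
+  // 640x360 frame becomes a single 230400-pixel row, so the loop runs once.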
+ if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if 
(IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_LASX;
+    }
+  }
+#endif
+
+  {
+    // Allocate a row of yuv.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYRow = ARGBToYRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYRow = ARGBToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYRow = ARGBToYRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
+static const uvec8 kShuffleMaskARGBToRGBA = {
+    3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+                     (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
+}
+
+// Convert ARGB To RGB24.
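+// A minimal usage sketch (buffer names are hypothetical, not part of this
+// patch). ARGB pixels are 4 bytes and RGB24 pixels are 3 bytes, so the two
+// strides differ:
+//   ARGBToRGB24(argb, width * 4, rgb24, width * 3, width, height);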
+LIBYUV_API +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRGB24Row = ARGBToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + int y; + void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRAWRow = ARGBToRAWRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, +}; + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). 
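+// Each output row y reads 4 bytes of the matrix at offset (y & 3) << 2, so
+// rows cycle through the four 4-byte dither patterns (e.g. y = 5 uses bytes
+// 4..7 of the table above).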
+LIBYUV_API +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { + int y; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565DitherRow(src_argb, dst_rgb565, + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To RGB565. +// TODO(fbarchard): Consider using dither function low level with zeros. +LIBYUV_API +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_LSX; + } + } +#endif + +#if defined(HAS_ARGBTORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LSX; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LSX; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. + if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToAR30Row_C; + if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar30 = 0; + } +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR30Row = ARGBToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToAR30Row(src_argb, dst_ar30, width); + src_argb += src_stride_argb; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYJRow = ARGBToYJRow_Any_LSX; + ARGBToUVJRow = ARGBToUVJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_LSX; + ARGBToUVJRow = ARGBToUVJRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + ARGBToUVJRow = ARGBToUVJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + ARGBToUVJRow = ARGBToUVJRow_LASX; + 
} + } +#endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// Convert ARGB to J422. (JPeg full range I422). +LIBYUV_API +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yj == width && + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYJRow = ARGBToYJRow_Any_LSX; + ARGBToUVJRow = ARGBToUVJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_LSX; + ARGBToUVJRow = ARGBToUVJRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = 
ARGBToYJRow_Any_LASX; + ARGBToUVJRow = ARGBToUVJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + ARGBToUVJRow = ARGBToUVJRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +// Convert RGBA to J400. +LIBYUV_API +int RGBAToJ400(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = + RGBAToYJRow_C; + if (!src_rgba || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } + // Coalesce rows. 
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_rgba = dst_stride_yj = 0; + } +#if defined(HAS_RGBATOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_RGBATOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGBAToYJRow = RGBAToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_AVX2; + } + } +#endif +#if defined(HAS_RGBATOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYJRow = RGBAToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYJRow = RGBAToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGBAToYJRow = RGBAToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGBATOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGBAToYJRow = RGBAToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGBATOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYJRow = RGBAToYJRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + RGBAToYJRow(src_rgba, dst_yj, width); + src_rgba += src_stride_rgba; + dst_yj += dst_stride_yj; + } + return 0; +} + +// Convert ABGR to J420. (JPeg full range I420). +LIBYUV_API +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow = ABGRToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width); + src_abgr += src_stride_abgr * 2; + dst_yj += dst_stride_yj * 2; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + if (height & 1) { + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + } + return 0; +} + +// Convert ABGR to J422. (JPeg full range I422). +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. 
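+  // U and V are half width here (422 sampling), so their strides are
+  // compared against width / 2 via dst_stride_uj * 2 == width.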
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width && + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow = ABGRToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + src_abgr += src_stride_abgr; + dst_yj += dst_stride_yj; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + return 0; +} + +// Convert ABGR to J400. +LIBYUV_API +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. 
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_yj = 0; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ABGRToYJRow(src_abgr, dst_yj, width); + src_abgr += src_stride_abgr; + dst_yj += dst_stride_yj; + } + return 0; +} + +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAR64Row = ARGBToAR64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAB64Row = ARGBToAB64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + +// Enabled if 1 pass is available +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) +#define HAS_RAWTOYJROW +#endif + +// RAW to JNV21 full range NV21 +LIBYUV_API +int RAWToJNV21(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; +#if defined(HAS_RAWTOYJROW) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + void (*MergeUVRow_)(const uint8_t* src_uj, const uint8_t* src_vj, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYJROW) + +// Neon version does direct RAW to YUV. 
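+// On one-pass platforms (Neon, MSA, RVV per HAS_RAWTOYJROW above), RAW rows
+// are converted straight to JPEG-range Y and UV. Other platforms take the
+// two-pass route in the #else branch below: expand RAW to ARGB scratch rows
+// first, then reuse the ARGBToYJ/ARGBToUVJ row functions.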
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_NEON; + RAWToUVJRow = RAWToUVJRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVJRow = RAWToUVJRow_Any_MSA; + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; + RAWToUVJRow = RAWToUVJRow_MSA; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. +#else // HAS_RAWTOYJROW + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#endif // HAS_RAWTOYJROW +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_LSX; + } + } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } +#endif + { + // Allocate a row of uv. 
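+ // Each scratch row is rounded up to a multiple of 32 bytes so that
+ // row_vj, which starts immediately after row_uj, stays SIMD-aligned and
+ // vector row functions have slack to store whole registers.
+ // Illustrative example: width 100 gives halfwidth 50, padded to 64.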
+ align_buffer_64(row_uj, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_vj = row_uj + ((halfwidth + 31) & ~31); +#if !defined(HAS_RAWTOYJROW) + // Allocate 2 rows of ARGB. + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVJRow(row, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYJROW) + free_aligned_buffer_64(row); +#endif + free_aligned_buffer_64(row_uj); + } + return 0; +} +#undef HAS_RAWTOYJROW + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/convert_jpeg.cc b/source/convert_jpeg.cc new file mode 100644 index 00000000..d7556ee9 --- /dev/null +++ b/source/convert_jpeg.cc @@ -0,0 +1,602 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8_t* y; + int y_stride; + uint8_t* u; + int u_stride; + uint8_t* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, + dest->u_stride, dest->v, dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8_t* src_mjpg, + size_t src_size_mjpg, + int* width, + int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. +LIBYUV_API +int MJPGToI420(const uint8_t* src_mjpg, + size_t src_size_mjpg, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (src_size_mjpg == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
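+ // Decode flow: load the frame, verify its dimensions against
+ // src_width/src_height, pick the callback that matches the detected
+ // subsampling (4:2:0, 4:2:2, 4:4:4 or grayscale), then stream decoded
+ // rows through it into the caller's planes.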
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, + dst_height); + } else { + // TODO(fbarchard): Implement conversion for any other + // colorspace/subsample factors that occur in practice. ERROR: Unable to + // convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +struct NV21Buffers { + uint8_t* y; + int y_stride; + uint8_t* vu; + int vu_stride; + int w; + int h; +}; + +static void JpegI420ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI422ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI444ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI400ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, + dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to NV21 +LIBYUV_API +int MJPGToNV21(const uint8_t* src_mjpg, + size_t src_size_mjpg, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (src_size_mjpg == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
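+ // Note: dst_vu is a single interleaved VU plane, so a tightly packed
+ // destination uses a stride of ((dst_width + 1) / 2) * 2 bytes.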
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu, + dst_stride_vu, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width, + dst_height); + } else { + // Unknown colorspace. + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} + +static void JpegI420ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. + I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI422ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. + I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI444ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. 
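+ // Passing the Cr plane (data[2]) as U and the Cb plane (data[1]) as V
+ // makes the NV21 writer emit the chroma bytes in UV order, i.e. NV12.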
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI400ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 since there is no UV plane. + I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, + dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +// MJPG (Motion JPEG) to NV12. +LIBYUV_API +int MJPGToNV12(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + // Use NV21Buffers but with UV instead of VU. + NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv, + dst_stride_uv, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width, + dst_height); + } else { + // Unknown colorspace. + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +struct ARGBBuffers { + uint8_t* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. +LIBYUV_API +int MJPGToARGB(const uint8_t* src_mjpg, + size_t src_size_mjpg, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (src_size_mjpg == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
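+ // Same flow as the planar converters above: validate dimensions, match
+ // the subsampling to a Jpeg*ToARGB callback, and decode row groups
+ // directly into the destination ARGB buffer.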
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, + dst_height); + } else { + // TODO(fbarchard): Implement conversion for any other + // colorspace/subsample factors that occur in practice. ERROR: Unable to + // convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} + +#endif // HAVE_JPEG + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/convert_to_argb.cc b/source/convert_to_argb.cc new file mode 100644 index 00000000..84df16c8 --- /dev/null +++ b/source/convert_to_argb.cc @@ -0,0 +1,382 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. 
+// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. + +// TODO(fbarchard): Add the following: +// H010ToARGB +// I010ToARGB + +LIBYUV_API +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8_t* src; + const uint8_t* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, + // and then rotate the ARGB to the final destination buffer. + // For in-place conversion, if destination dst_argb is same as source sample, + // also enable temporary buffer. + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8_t* dest_argb = dst_argb; + int dest_dst_stride_argb = dst_stride_argb; + uint8_t* rotate_buffer = NULL; + int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + + if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { + return -1; + } + if (src_height < 0) { + inv_crop_height = -inv_crop_height; + } + + if (need_buf) { + int argb_size = crop_width * 4 * abs_crop_height; + rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ + if (!rotate_buffer) { + return 1; // Out of memory runtime error. 
+ } + dst_argb = rotate_buffer; + dst_stride_argb = crop_width * 4; + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_ARGB: + if (!need_buf && !rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + } + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AR30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AB30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_J400: + src = sample + src_width * crop_y + crop_x; + r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = + sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = + sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. 
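+ // (NV21ToARGB itself treats the interleaved plane as VU order, so the
+ // same pointers are passed through unchanged.)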
+ r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_J420: { + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_H420: { + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_U420: { + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_I422: + case FOURCC_YV16: { + int halfwidth = (src_width + 1) / 2; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_J422: { + int halfwidth = (src_width + 1) / 2; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u = + sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; + const uint8_t* src_v = 
sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_H422: { + int halfwidth = (src_width + 1) / 2; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u = + sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_U422: { + int halfwidth = (src_width + 1) / 2; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u = + sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_I444: + case FOURCC_YV24: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_J444: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_H444: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_U444: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, + abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. 
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ } else if (rotation) {
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height, rotation);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/convert_to_i420.cc b/source/convert_to_i420.cc
new file mode 100644
index 00000000..5869ecd7
--- /dev/null
+++ b/source/convert_to_i420.cc
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h> // For malloc()/free().
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32_t fourcc) {
+ uint32_t format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8_t* src;
+ const uint8_t* src_uv;
+ const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ // TODO(nisse): Why allow crop_height < 0?
+ const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+ LIBYUV_BOOL need_buf =
+ (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+ format != FOURCC_NV21 && format != FOURCC_YV12) ||
+ dst_y == sample;
+ uint8_t* tmp_y = dst_y;
+ uint8_t* tmp_u = dst_u;
+ uint8_t* tmp_v = dst_v;
+ int tmp_y_stride = dst_stride_y;
+ int tmp_u_stride = dst_stride_u;
+ int tmp_v_stride = dst_stride_v;
+ uint8_t* rotate_buffer = NULL;
+ const int inv_crop_height =
+ (src_height < 0) ? -abs_crop_height : abs_crop_height;
+
+ if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+ crop_width <= 0 || src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if destination dst_y is same as source sample,
+ // also enable temporary buffer.
+ if (need_buf) {
+ int y_size = crop_width * abs_crop_height;
+ int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+ rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ } + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix. + uint8_t* u = (crop_x & 1) ? dst_v : dst_u; + uint8_t* v = (crop_x & 1) ? dst_u : dst_v; + int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; + int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v; + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, + stride_u, v, stride_v, crop_width, inv_crop_height); + break; + } + case FOURCC_UYVY: { + uint8_t* u = (crop_x & 1) ? dst_v : dst_u; + uint8_t* v = (crop_x & 1) ? dst_u : dst_v; + int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; + int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v; + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, + stride_u, v, stride_v, crop_width, inv_crop_height); + break; + } + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + // TODO(fbarchard): Add AR30 and AB30 + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, 
src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with dst_u and dst_v parameters swapped. + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, crop_width, inv_crop_height, rotation); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + + (crop_x / 2); + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); + } else { + src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + + (crop_x / 2); + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); + } + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + (crop_x / 2); + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + (crop_x / 2); + } else { + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + (crop_x / 2); + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + (crop_x / 2); + } + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, src_width, + abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. 
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride, crop_width, abs_crop_height,
+ rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
new file mode 100644
index 00000000..0c4a1581
--- /dev/null
+++ b/source/cpu_id.cc
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h> // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+// For ArmCpuCaps(), but unit tested on all platforms.
+#include <stdio.h> // For fopen()
+#include <string.h>
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid an additional check.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
+ !defined(__clang__)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// cpu_info_ variable for SIMD instruction sets detected.
+LIBYUV_API int cpu_info_ = 0;
+
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(int info_eax, int info_ecx, int* cpu_info) {
+#if defined(_MSC_VER)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ __cpuidex(cpu_info, info_eax, info_ecx);
+#elif defined(_M_IX86)
+ __asm {
+ mov eax, info_eax
+ mov ecx, info_ecx
+ mov edi, cpu_info
+ cpuid
+ mov [edi], eax
+ mov [edi + 4], ebx
+ mov [edi + 8], ecx
+ mov [edi + 12], edx
+ }
+#else // Visual C but not x86
+ if (info_ecx == 0) {
+ __cpuid(cpu_info, info_eax);
+ } else {
+ cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
+ }
+#endif
+// GCC version uses inline x86 assembly.
+#else // defined(_MSC_VER)
+ int info_ebx, info_edx;
+ asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+ // Preserve ebx for fpic 32 bit.
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=D"(info_ebx),
+#else
+ "cpuid \n"
+ : "=b"(info_ebx),
+#endif // defined( __i386__) && defined(__PIC__)
+ "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
+ cpu_info[0] = info_eax;
+ cpu_info[1] = info_ebx;
+ cpu_info[2] = info_ecx;
+ cpu_info[3] = info_edx;
+#endif // defined(_MSC_VER)
+}
+#else // (defined(_M_IX86) || defined(_M_X64) ...
+LIBYUV_API
+void CpuId(int eax, int ecx, int* cpu_info) {
+ (void)eax;
+ (void)ecx;
+ cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// For VS2010 and earlier emit can be used:
+// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+// __asm {
+// xor ecx, ecx // xcr 0
+// xgetbv
+// mov xcr0, eax
+// }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+// X86 CPUs have xgetbv to detect whether the OS saves the high parts of the
+// YMM registers.
+static int GetXCR0() {
+ int xcr0 = 0;
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
+#elif defined(__i386__) || defined(__x86_64__)
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
+#endif // defined(__i386__) || defined(__x86_64__)
+ return xcr0;
+}
+#else
+// xgetbv unavailable to query for OSSave support. Return 0.
+#define GetXCR0() 0
+#endif // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
+#endif
+
+// Based on libvpx arm_cpudetect.c
+// For Arm, but public to allow testing on any CPU
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
+ char cpuinfo_line[512];
+ FILE* f = fopen(cpuinfo_name, "re");
+ if (!f) {
+ // Assume Neon if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return kCpuHasNEON;
+ }
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
+ if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+ char* p = strstr(cpuinfo_line, " neon");
+ if (p && (p[5] == ' ' || p[5] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ // aarch64 uses asimd for Neon.
+ p = strstr(cpuinfo_line, " asimd");
+ if (p) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ }
+ }
+ fclose(f);
+ return 0;
+}
+
+LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) {
+ char cpuinfo_line[512];
+ int flag = 0;
+ FILE* f = fopen(cpuinfo_name, "re");
+ if (!f) {
+#if defined(__riscv_vector)
+ // Assume RVV if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return kCpuHasRVV;
+#else
+ return 0;
+#endif
+ }
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
+ if (memcmp(cpuinfo_line, "isa", 3) == 0) {
+ // ISA string must begin with rv64{i,e,g} for a 64-bit processor.
+ char* isa = strstr(cpuinfo_line, "rv64");
+ if (isa) {
+ size_t isa_len = strlen(isa);
+ char* extensions;
+ size_t extensions_len = 0;
+ size_t std_isa_len;
+ // Remove the new-line character at the end of the string.
+ if (isa[isa_len - 1] == '\n') {
+ isa[--isa_len] = '\0';
+ }
+ // 5 ISA characters
+ if (isa_len < 5) {
+ fclose(f);
+ return 0;
+ }
+ // Skip {i,e,g} canonical checking.
+ // Skip rvxxx
+ isa += 5;
+ // Find the very first occurrence of 's', 'x' or 'z'.
+ // To detect multi-letter standard, non-standard, and
+ // supervisor-level extensions.
+ extensions = strpbrk(isa, "zxs");
+ if (extensions) {
+ // Multi-letter extensions are separated by a single underscore
+ // as described in RISC-V User-Level ISA V2.2.
+ char* ext = strtok(extensions, "_");
+ extensions_len = strlen(extensions);
+ while (ext) {
+ // Search for the ZVFH (Vector FP16) extension.
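+ // Worked example (illustrative): for isa = "rv64gcv_zvfh", the code
+ // skips the 5 characters "rv64g", tokenizes the multi-letter part
+ // "zvfh" (setting kCpuHasRVVZVFH below), and the memchr() scan that
+ // follows finds the 'v' in the single-letter span "cv", setting
+ // kCpuHasRVV.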
+ if (!strcmp(ext, "zvfh")) { + flag |= kCpuHasRVVZVFH; + } + ext = strtok(NULL, "_"); + } + } + std_isa_len = isa_len - extensions_len - 5; + // Detect the v in the standard single-letter extensions. + if (memchr(isa, 'v', std_isa_len)) { + // The RVV implied the F extension. + flag |= kCpuHasRVV; + } + } + } +#if defined(__riscv_vector) + // Assume RVV if /proc/cpuinfo is from x86 host running QEMU. + else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) || + (memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) { + fclose(f); + return kCpuHasRVV; + } +#endif + } + fclose(f); + return flag; +} + +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + int flag = 0; + FILE* f = fopen(cpuinfo_name, "re"); + if (!f) { + // Assume nothing if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return 0; + } + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { + if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { + // Workaround early kernel without MSA in ASEs line. + if (strstr(cpuinfo_line, "Loongson-2K")) { + flag |= kCpuHasMSA; + } + } + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + if (strstr(cpuinfo_line, "msa")) { + flag |= kCpuHasMSA; + } + // ASEs is the last line, so we can break here. + break; + } + } + fclose(f); + return flag; +} + +#define LOONGARCH_CFG2 0x2 +#define LOONGARCH_CFG2_LSX (1 << 6) +#define LOONGARCH_CFG2_LASX (1 << 7) + +#if defined(__loongarch__) +LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) { + int flag = 0; + uint32_t cfg2 = 0; + + __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2)); + + if (cfg2 & LOONGARCH_CFG2_LSX) + flag |= kCpuHasLSX; + + if (cfg2 & LOONGARCH_CFG2_LASX) + flag |= kCpuHasLASX; + return flag; +} +#endif + +static SAFEBUFFERS int GetCpuFlags(void) { + int cpu_info = 0; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; + CpuId(0, 0, cpu_info0); + CpuId(1, 0, cpu_info1); + if (cpu_info0[0] >= 7) { + CpuId(7, 0, cpu_info7); + } + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); + + // AVX requires OS saves YMM registers. + if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave + ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); + + // Detect AVX512bw + if ((GetXCR0() & 0xe0) == 0xe0) { + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? 
+      cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
+    }
+  }
+#endif
+#if defined(__mips__) && defined(__linux__)
+  cpu_info = MipsCpuCaps("/proc/cpuinfo");
+  cpu_info |= kCpuHasMIPS;
+#endif
+#if defined(__loongarch__) && defined(__linux__)
+  cpu_info = LoongarchCpuCaps();
+  cpu_info |= kCpuHasLOONGARCH;
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+#if defined(__aarch64__)
+  cpu_info = kCpuHasNEON;
+#else
+  // Linux arm parse text file for neon detect.
+  cpu_info = ArmCpuCaps("/proc/cpuinfo");
+#endif
+  cpu_info |= kCpuHasARM;
+#endif  // __arm__
+#if defined(__riscv) && defined(__linux__)
+  cpu_info = RiscvCpuCaps("/proc/cpuinfo");
+  cpu_info |= kCpuHasRISCV;
+#endif  // __riscv
+  cpu_info |= kCpuInitialized;
+  return cpu_info;
+}
+
+// Note that use of this function is not thread safe.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags) {
+  int cpu_info = GetCpuFlags() & enable_flags;
+  SetCpuFlags(cpu_info);
+  return cpu_info;
+}
+
+LIBYUV_API
+int InitCpuFlags(void) {
+  return MaskCpuFlags(-1);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/mjpeg_decoder.cc b/source/mjpeg_decoder.cc
new file mode 100644
index 00000000..0141da8a
--- /dev/null
+++ b/source/mjpeg_decoder.cc
@@ -0,0 +1,581 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable : 4324)
+#endif
+
+#endif
+
+#include <stdio.h>  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+  jpeg_error_mgr base;  // Must be at the top
+  jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
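+// These implement a custom jpeg_source_mgr over the buffer handed to
+// LoadFrame(): fill_input_buffer() gives jpeglib the next buffer in the
+// BufferVector, and skip_input_data() advances within the current buffer.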
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+void OutputHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+    : has_scanline_padding_(LIBYUV_FALSE),
+      num_outbufs_(0),
+      scanlines_(NULL),
+      scanlines_sizes_(NULL),
+      databuf_(NULL),
+      databuf_strides_(NULL) {
+  decompress_struct_ = new jpeg_decompress_struct;
+  source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+  error_mgr_ = new SetJmpErrorMgr;
+  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+  // Override standard exit()-based error handler.
+  error_mgr_->base.error_exit = &ErrorHandler;
+  error_mgr_->base.output_message = &OutputHandler;
+#endif
+  decompress_struct_->client_data = NULL;
+  source_mgr_->init_source = &init_source;
+  source_mgr_->fill_input_buffer = &fill_input_buffer;
+  source_mgr_->skip_input_data = &skip_input_data;
+  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+  source_mgr_->term_source = &term_source;
+  jpeg_create_decompress(decompress_struct_);
+  decompress_struct_->src = source_mgr_;
+  buf_vec_.buffers = &buf_;
+  buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+  jpeg_destroy_decompress(decompress_struct_);
+  delete decompress_struct_;
+  delete source_mgr_;
+#ifdef HAVE_SETJMP
+  delete error_mgr_;
+#endif
+  DestroyOutputBuffers();
+}
+
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
+  if (!ValidateJpeg(src, src_len)) {
+    return LIBYUV_FALSE;
+  }
+
+  buf_.data = src;
+  buf_.len = (int)src_len;
+  buf_vec_.pos = 0;
+  decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_read_header, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+    // ERROR: Bad MJPEG header
+    return LIBYUV_FALSE;
+  }
+  AllocOutputBuffers(GetNumComponents());
+  for (int i = 0; i < num_outbufs_; ++i) {
+    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+    if (scanlines_sizes_[i] != scanlines_size) {
+      if (scanlines_[i]) {
+        delete[] scanlines_[i];
+      }
+      scanlines_[i] = new uint8_t*[scanlines_size];
+      scanlines_sizes_[i] = scanlines_size;
+    }
+
+    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
+    // the preceding scanlines, the padding is not needed/wanted because the
+    // following addresses will already be valid (they are the initial bytes of
+    // the next scanline) and will be overwritten when jpeglib writes out that
+    // next scanline.
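+    // GetComponentStride() below rounds the component width up to a
+    // multiple of DCTSIZE, which provides that padding.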
+    int databuf_stride = GetComponentStride(i);
+    int databuf_size = scanlines_size * databuf_stride;
+    if (databuf_strides_[i] != databuf_stride) {
+      if (databuf_[i]) {
+        delete[] databuf_[i];
+      }
+      databuf_[i] = new uint8_t[databuf_size];
+      databuf_strides_[i] = databuf_stride;
+    }
+
+    if (GetComponentStride(i) != GetComponentWidth(i)) {
+      has_scanline_padding_ = LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+  return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+  return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+  return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+  return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+  return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+  return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+  return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+  return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+  return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+  return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+  return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+  int vs = GetVertSubSampFactor(component);
+  return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+  int hs = GetHorizSubSampFactor(component);
+  return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+  int vs = GetVertSubSampFactor(component);
+  return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+  return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {
+  return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_abort_decompress, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes,
+                                          int dst_width,
+                                          int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
+    // ERROR: Bad dimensions
+    return LIBYUV_FALSE;
+  }
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called into jpeglib, it experienced an error sometime during this
+    // function call, and we called longjmp() and rewound the stack to here.
+ // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = + GetComponentScanlinesPerImcuRow(i) - rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), + scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. 
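+      // Decode it into the temp buffers, then hand only the wanted rows to
+      // the callback by temporarily offsetting the data pointers.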
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      for (int i = 0; i < num_outbufs_; ++i) {
+        // TODO(fbarchard): Compute skip to avoid this
+        assert(skip % GetVertSubSampFactor(i) == 0);
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        // Change our own data buffer pointers so we can pass them to the
+        // callback.
+        databuf_[i] += data_to_skip;
+      }
+      int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+      (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+      // Now change them back.
+      for (int i = 0; i < num_outbufs_; ++i) {
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        databuf_[i] -= data_to_skip;
+      }
+      lines_left -= scanlines_to_copy;
+    }
+  }
+  // Read full MCUs until we get to the crop point.
+  for (; lines_left >= GetImageScanlinesPerImcuRow();
+       lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+  }
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+  }
+  return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) {
+  fill_input_buffer(cinfo);
+}
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
+  if (buf_vec->pos >= buf_vec->len) {
+    // ERROR: No more data
+    return FALSE;
+  }
+  cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+  cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+  ++buf_vec->pos;
+  return TRUE;
+}
+
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
+  jpeg_source_mgr* src = cinfo->src;
+  size_t bytes = (size_t)num_bytes;
+  if (bytes > src->bytes_in_buffer) {
+    src->next_input_byte = nullptr;
+    src->bytes_in_buffer = 0;
+  } else {
+    src->next_input_byte += bytes;
+    src->bytes_in_buffer -= bytes;
+  }
+}
+
+void term_source(j_decompress_ptr cinfo) {
+  (void)cinfo;  // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+  char buf[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buf);
+// ERROR: Error in jpeglib: buf
+#endif
+
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+  // This rewinds the call stack to the point of the corresponding setjmp()
+  // and causes it to return (for a second time) with value 1.
+  longjmp(mgr->setjmp_buffer, 1);
+}
+
+// Suppress fprintf warnings.
+void OutputHandler(j_common_ptr cinfo) { + (void)cinfo; +} + +#endif // HAVE_SETJMP + +void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { + if (num_outbufs != num_outbufs_) { + // We could perhaps optimize this case to resize the output buffers without + // necessarily having to delete and recreate each one, but it's not worth + // it. + DestroyOutputBuffers(); + + scanlines_ = new uint8_t**[num_outbufs]; + scanlines_sizes_ = new int[num_outbufs]; + databuf_ = new uint8_t*[num_outbufs]; + databuf_strides_ = new int[num_outbufs]; + + for (int i = 0; i < num_outbufs; ++i) { + scanlines_[i] = NULL; + scanlines_sizes_[i] = 0; + databuf_[i] = NULL; + databuf_strides_[i] = 0; + } + + num_outbufs_ = num_outbufs; + } +} + +void MJpegDecoder::DestroyOutputBuffers() { + for (int i = 0; i < num_outbufs_; ++i) { + delete[] scanlines_[i]; + delete[] databuf_[i]; + } + delete[] scanlines_; + delete[] databuf_; + delete[] scanlines_sizes_; + delete[] databuf_strides_; + scanlines_ = NULL; + databuf_ = NULL; + scanlines_sizes_ = NULL; + databuf_strides_ = NULL; + num_outbufs_ = 0; +} + +// JDCT_IFAST and do_block_smoothing improve performance substantially. +LIBYUV_BOOL MJpegDecoder::StartDecode() { + decompress_struct_->raw_data_out = TRUE; + decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default + decompress_struct_->dither_mode = JDITHER_NONE; + // Not applicable to 'raw': + decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE); + // Only for buffered mode: + decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE); + // Blocky but fast: + decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE); + + if (!jpeg_start_decompress(decompress_struct_)) { + // ERROR: Couldn't start JPEG decompressor"; + return LIBYUV_FALSE; + } + return LIBYUV_TRUE; +} + +LIBYUV_BOOL MJpegDecoder::FinishDecode() { + // jpeglib considers it an error if we finish without decoding the whole + // image, so we call "abort" rather than "finish". + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +void MJpegDecoder::SetScanlinePointers(uint8_t** data) { + for (int i = 0; i < num_outbufs_; ++i) { + uint8_t* data_i = data[i]; + for (int j = 0; j < scanlines_sizes_[i]; ++j) { + scanlines_[i][j] = data_i; + data_i += GetComponentStride(i); + } + } +} + +inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { + return (unsigned int)(GetImageScanlinesPerImcuRow()) == + jpeg_read_raw_data(decompress_struct_, scanlines_, + GetImageScanlinesPerImcuRow()); +} + +// The helper function which recognizes the jpeg sub-sampling type. +JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( + int* subsample_x, + int* subsample_y, + int number_of_components) { + if (number_of_components == 3) { // Color images. + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { + return kJpegYuv420; + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { + return kJpegYuv422; + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && + subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { + return kJpegYuv444; + } + } else if (number_of_components == 1) { // Grey-scale images. 
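+    // Only a luma plane is present, so report 4:0:0.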
+    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+      return kJpegYuv400;
+    }
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
diff --git a/source/mjpeg_validate.cc b/source/mjpeg_validate.cc
new file mode 100644
index 00000000..ba0a03ab
--- /dev/null
+++ b/source/mjpeg_validate.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h>  // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to scan for EOI marker (0xff 0xd9).
+static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+  if (src_size_mjpg >= 2) {
+    const uint8_t* end = src_mjpg + src_size_mjpg - 1;
+    const uint8_t* it = src_mjpg;
+    while (it < end) {
+      // TODO(fbarchard): scan for 0xd9 instead.
+      it = (const uint8_t*)(memchr(it, 0xff, end - it));
+      if (it == NULL) {
+        break;
+      }
+      if (it[1] == 0xd9) {
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      ++it;  // Skip over current 0xff.
+    }
+  }
+  // ERROR: Invalid jpeg end code not found. Size src_size_mjpg
+  return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+  // Maximum size that ValidateJpeg will consider valid.
+  const size_t kMaxJpegSize = 0x7fffffffull;
+  const size_t kBackSearchSize = 1024;
+  if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) {
+    // ERROR: Invalid jpeg size: src_size_mjpg
+    return LIBYUV_FALSE;
+  }
+  // SOI marker
+  if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) {
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+
+  // Look for the End Of Image (EOI) marker near the end of the buffer.
+  if (src_size_mjpg > kBackSearchSize) {
+    if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Reduce search size for forward search.
+    src_size_mjpg = src_size_mjpg - kBackSearchSize + 1;
+  }
+  // Step over SOI marker and scan for EOI.
+  return ScanEOI(src_mjpg + 2, src_size_mjpg - 2);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
new file mode 100644
index 00000000..d115a2a1
--- /dev/null
+++ b/source/planar_functions.cc
@@ -0,0 +1,5715 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <assert.h>
+#include <string.h>  // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"  // for ScaleRowDown2
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  int y;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
+
+  // Copy plane
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+LIBYUV_API
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height) {
+  CopyPlane((const uint8_t*)src_y, src_stride_y * 2, (uint8_t*)dst_y,
+            dst_stride_y * 2, width * 2, height);
+}
+
+// Convert a plane of 16 bit data to 8 bit
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
+                          int width) = Convert16To8Row_C;
+
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
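+  // When the strides equal the row width, the rows are contiguous in memory
+  // and the whole plane can be converted as a single long row.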
+ if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Convert16To8Row = Convert16To8Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_NEON; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 8 bit data to 16 bit +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 1024 for 10 bits + int width, + int height) { + int y; + void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, + int width) = Convert8To16Row_C; + + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO16ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Convert8To16Row = Convert8To16Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_SSE2; + } + } +#endif +#if defined(HAS_CONVERT8TO16ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To16Row = Convert8To16Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To16Row = Convert8To16Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To16Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I422. +LIBYUV_API +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I444. 
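+// I444 is fully sampled 4:4:4, so unlike I422Copy above, the U and V planes
+// are copied at full width rather than half width.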
+LIBYUV_API +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I210. +LIBYUV_API +int I210Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I410. +LIBYUV_API +int I410Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I400. +LIBYUV_API +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
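+  // Flip vertically by starting at the last source row and walking a
+  // negated stride.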
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Convert I420 to I400. +LIBYUV_API +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + (void)src_u; + (void)src_stride_u; + (void)src_v; + (void)src_stride_v; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Copy NV12. Supports inverting. +LIBYUV_API +int NV12Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2, + halfheight); + return 0; +} + +// Copy NV21. Supports inverting. +LIBYUV_API +int NV21Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, + dst_stride_y, dst_vu, dst_stride_vu, width, height); +} + +// Support function for NV12 etc UV channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. 
+ if (src_stride_uv == width * 2 && dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_SPLITUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SplitUVRow = SplitUVRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_LSX; + } + } +#endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } +} + +LIBYUV_API +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uv = dst_uv + (height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + // Coalesce rows. + if (src_stride_u == width && src_stride_v == width && + dst_stride_uv == width * 2) { + width *= height; + height = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow = MergeUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow = MergeUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_LSX; + } + } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of UV. 
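+    // Output is interleaved as U0 V0 U1 V1 ...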
+ MergeUVRow(src_u, src_v, dst_uv, width); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } +} + +// Support function for P010 etc UV channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitUVPlane_16(const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + int y; + void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, + uint16_t* dst_v, int depth, int width) = + SplitUVRow_16_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_uv == width * 2 && dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow_16 = SplitUVRow_16_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow_16 = SplitUVRow_16_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow_16 = SplitUVRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SplitUVRow_16 = SplitUVRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of UV. + SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } +} + +LIBYUV_API +void MergeUVPlane_16(const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height, + int depth) { + int y; + void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, + uint16_t* dst_uv, int depth, int width) = + MergeUVRow_16_C; + assert(depth >= 8); + assert(depth <= 16); + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uv = dst_uv + (height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + // Coalesce rows. + if (src_stride_u == width && src_stride_v == width && + dst_stride_uv == width * 2) { + width *= height; + height = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_16 = MergeUVRow_16_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + MergeUVRow_16 = MergeUVRow_16_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_16 = MergeUVRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeUVRow_16 = MergeUVRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of UV. + MergeUVRow_16(src_u, src_v, dst_uv, depth, width); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } +} + +// Convert plane from lsb to msb +LIBYUV_API +void ConvertToMSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth) { + int y; + int scale = 1 << (16 - depth); + void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = MultiplyRow_16_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + +#if defined(HAS_MULTIPLYROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MultiplyRow_16 = MultiplyRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MultiplyRow_16 = MultiplyRow_16_AVX2; + } + } +#endif +#if defined(HAS_MULTIPLYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MultiplyRow_16 = MultiplyRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MultiplyRow_16 = MultiplyRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MultiplyRow_16(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert plane from msb to lsb +LIBYUV_API +void ConvertToLSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth) { + int y; + int scale = 1 << depth; + void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = DivideRow_16_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + +#if defined(HAS_DIVIDEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + DivideRow = DivideRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + DivideRow = DivideRow_16_AVX2; + } + } +#endif +#if defined(HAS_DIVIDEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DivideRow = DivideRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DivideRow = DivideRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + DivideRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Swap U and V channels in interleaved UV plane. +LIBYUV_API +void SwapUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = + SwapUVRow_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } + // Coalesce rows. + if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) { + width *= height; + height = 1; + src_stride_uv = dst_stride_vu = 0; + } + +#if defined(HAS_SWAPUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SwapUVRow = SwapUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SwapUVRow = SwapUVRow_SSSE3; + } + } +#endif +#if defined(HAS_SWAPUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SwapUVRow = SwapUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SwapUVRow = SwapUVRow_AVX2; + } + } +#endif +#if defined(HAS_SWAPUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SwapUVRow = SwapUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SwapUVRow = SwapUVRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + SwapUVRow(src_uv, dst_vu, width); + src_uv += src_stride_uv; + dst_vu += dst_stride_vu; + } +} + +// Convert NV21 to NV12. 
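+// NV21 and NV12 differ only in the order of the interleaved chroma bytes
+// (VU vs UV), so this is a luma copy plus SwapUVPlane() on the chroma.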
+LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + if (!src_vu || !dst_uv || width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_vu = src_vu + (halfheight - 1) * src_stride_vu; + src_stride_vu = -src_stride_vu; + } + + SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth, + halfheight); + return 0; +} + +// Test if tile_height is a power of 2 (16 or 32) +#define IS_POWEROFTWO(x) (!((x) & ((x)-1))) + +// Detile a plane of data +// tile width is 16 and assumed. +// tile_height is 16 or 32 for MM21. +// src_stride_y is bytes per row of source ignoring tiling. e.g. 640 +// TODO: More detile row functions. +LIBYUV_API +int DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, + int width) = DetileRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + +#if defined(HAS_DETILEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileRow = DetileRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileRow = DetileRow_SSE2; + } + } +#endif +#if defined(HAS_DETILEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileRow = DetileRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileRow = DetileRow_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileRow(src_y, src_tile_stride, dst_y, width); + dst_y += dst_stride_y; + src_y += 16; + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_tile_stride + src_stride_y * tile_height; + } + } + return 0; +} + +// Convert a plane of 16 bit tiles of 16 x H to linear. +// tile width is 16 and assumed. +// tile_height is 16 or 32 for MT2T. +LIBYUV_API +int DetilePlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride, + uint16_t* dst, int width) = DetileRow_16_C; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + +#if defined(HAS_DETILEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileRow_16 = DetileRow_16_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_SSE2; + } + } +#endif +#if defined(HAS_DETILEROW_16_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + DetileRow_16 = DetileRow_16_Any_AVX; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_AVX; + } + } +#endif +#if defined(HAS_DETILEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileRow_16 = DetileRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileRow_16(src_y, src_tile_stride, dst_y, width); + dst_y += dst_stride_y; + src_y += 16; + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_tile_stride + src_stride_y * tile_height; + } + } + return 0; +} + +LIBYUV_API +void DetileSplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride, + uint8_t* dst_u, uint8_t* dst_v, int width) = + DetileSplitUVRow_C; + assert(src_stride_uv >= 0); + assert(tile_height > 0); + assert(src_stride_uv > 0); + + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_stride_u = -dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_v = -dst_stride_v; + } + +#if defined(HAS_DETILESPLITUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + DetileSplitUVRow = DetileSplitUVRow_SSSE3; + } + } +#endif +#if defined(HAS_DETILESPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileSplitUVRow = DetileSplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileSplitUVRow = DetileSplitUVRow_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += 16; + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height; + } + } +} + +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height) { + const ptrdiff_t src_y_tile_stride = 16 * tile_height; + const ptrdiff_t src_uv_tile_stride = src_y_tile_stride / 2; + int y; + void (*DetileToYUY2)(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, int width) = DetileToYUY2_C; + assert(src_stride_y >= 0); + assert(src_stride_y > 0); + assert(src_stride_uv >= 0); + assert(src_stride_uv > 0); + assert(tile_height > 0); + + if (width <= 0 || height == 0 || tile_height <= 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + +#if defined(HAS_DETILETOYUY2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileToYUY2 = DetileToYUY2_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_NEON; + } + } +#endif + +#if defined(HAS_DETILETOYUY2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileToYUY2 = DetileToYUY2_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_SSE2; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, + width); + dst_yuy2 += dst_stride_yuy2; + src_y += 16; + + if (y & 0x1) + src_uv += 16; + + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_y_tile_stride + src_stride_y * tile_height; + src_uv = src_uv - src_uv_tile_stride + src_stride_uv * (tile_height / 2); + } + } +} + +// Support function for NV12 etc RGB channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitRGBRow_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + } + // Coalesce rows. + if (src_stride_rgb == width * 3 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } +#if defined(HAS_SPLITRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitRGBRow = SplitRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif +#if defined(HAS_SPLITRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitRGBRow = SplitRGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of RGB. + SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_rgb += src_stride_rgb; + } +} + +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height) { + int y; + void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_rgb, int width) = + MergeRGBRow_C; + if (width <= 0 || height == 0) { + return; + } + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + // Coalesce rows. 
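+  // Three planar sources merge into packed RGB24, three bytes per pixel,
+  // hence the width * 3 test on the destination stride.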
+  if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+      dst_stride_rgb == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
+  }
+#if defined(HAS_MERGERGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MergeRGBRow = MergeRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeRGBRow = MergeRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeRGBRow = MergeRGBRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge a row of R, G and B into a row of RGB.
+    MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+    src_r += src_stride_r;
+    src_g += src_stride_g;
+    src_b += src_stride_b;
+    dst_rgb += dst_stride_rgb;
+  }
+}
+
+LIBYUV_NOINLINE
+static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
+                                int src_stride_argb,
+                                uint8_t* dst_r,
+                                int dst_stride_r,
+                                uint8_t* dst_g,
+                                int dst_stride_g,
+                                uint8_t* dst_b,
+                                int dst_stride_b,
+                                uint8_t* dst_a,
+                                int dst_stride_a,
+                                int width,
+                                int height) {
+  int y;
+  void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+                       uint8_t* dst_b, uint8_t* dst_a, int width) =
+      SplitARGBRow_C;
+
+  assert(height > 0);
+
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_r == width &&
+      dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
+        dst_stride_a = 0;
+  }
+
+#if defined(HAS_SPLITARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitARGBRow = SplitARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      SplitARGBRow = SplitARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SplitARGBRow = SplitARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      SplitARGBRow = SplitARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SPLITARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitARGBRow = SplitARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitARGBRow = SplitARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitARGBRow = SplitARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitARGBRow = SplitARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SPLITARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitARGBRow = SplitARGBRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
+    dst_r += dst_stride_r;
+    dst_g += dst_stride_g;
+    dst_b += dst_stride_b;
+    dst_a += dst_stride_a;
+    src_argb += src_stride_argb;
+  }
+}
+
+LIBYUV_NOINLINE
+static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
+                                 int src_stride_argb,
+                                 uint8_t* dst_r,
+                                 int dst_stride_r,
+                                 uint8_t* dst_g,
+                                 int dst_stride_g,
+                                 uint8_t* dst_b,
+                                 int dst_stride_b,
+                                 int width,
+                                 int height) {
+  int y;
+  void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+                       uint8_t* dst_b, int width) = SplitXRGBRow_C;
+  assert(height > 0);
+
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_r == width &&
+      dst_stride_g == width && dst_stride_b == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+  }
+
+#if
defined(HAS_SPLITXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitXRGBRow = SplitXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitXRGBRow = SplitXRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitXRGBRow = SplitXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitXRGBRow = SplitXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_NEON; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitXRGBRow = SplitXRGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_argb += src_stride_argb; + } +} + +LIBYUV_API +void SplitARGBPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_a = dst_a + (height - 1) * dst_stride_a; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + dst_stride_a = -dst_stride_a; + } + + if (dst_a == NULL) { + SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, width, height); + } else { + SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a, + width, height); + } +} + +LIBYUV_NOINLINE +static void MergeARGBPlaneAlpha(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, const uint8_t* src_a, + uint8_t* dst_argb, int width) = MergeARGBRow_C; + + assert(height > 0); + + if (width <= 0 || height == 0) { + return; + } + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeARGBRow = MergeARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeARGBRow = MergeARGBRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGBRow = MergeARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeARGBRow = MergeARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_NEON; + } + } +#endif +#if defined(HAS_MERGEARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeARGBRow = 
MergeARGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_NOINLINE +static void MergeARGBPlaneOpaque(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_argb, int width) = + MergeXRGBRow_C; + + assert(height > 0); + + if (width <= 0 || height == 0) { + return; + } + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeXRGBRow = MergeXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeXRGBRow = MergeXRGBRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGBRow = MergeXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGBRow = MergeXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_NEON; + } + } +#endif +#if defined(HAS_MERGEXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeXRGBRow = MergeXRGBRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_API +void MergeARGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + if (src_a == NULL) { + MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height); + } else { + MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height); + } +} + +// TODO(yuan): Support 2 bit alpha channel. +LIBYUV_API +void MergeXR30Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height, + int depth) { + int y; + void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_ar30, int depth, + int width) = MergeXR30Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + // Coalesce rows. 
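+  // A contiguous XR30 output row is width * 4 bytes: each pixel packs three
+  // 10-bit channels plus two alpha bits into one 32-bit word.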
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; + } +#if defined(HAS_MERGEXR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR30Row = MergeXR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR30Row = MergeXR30Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (depth == 10) { + MergeXR30Row = MergeXR30Row_10_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_10_NEON; + } + } else { + MergeXR30Row = MergeXR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_NEON; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar30 += dst_stride_ar30; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint16_t* dst_argb, int depth, int width) = + MergeAR64Row_C; + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeAR64Row = MergeAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeAR64Row = MergeAR64Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeAR64Row = MergeAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeAR64Row = MergeAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint16_t* dst_argb, int depth, + int width) = MergeXR64Row_C; + + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEXR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR64Row = MergeXR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR64Row = MergeXR64Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXR64Row = MergeXR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR64Row = MergeXR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_API +void MergeAR64Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; + dst_stride_ar64 = -dst_stride_ar64; + } + + if (src_a == NULL) { + MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_ar64, dst_stride_ar64, width, height, + depth); + } else { + MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_ar64, + dst_stride_ar64, width, height, depth); + } +} + +LIBYUV_NOINLINE +static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint8_t* dst_argb, int depth, int width) = + MergeARGB16To8Row_C; + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGB16To8Row = MergeARGB16To8Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEARGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeARGB16To8Row = MergeARGB16To8Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_NOINLINE +static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_argb, int depth, + int width) = 
MergeXRGB16To8Row_C; + + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXRGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_API +void MergeARGB16To8Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + if (src_a == NULL) { + MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height, depth); + } else { + MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height, depth); + } +} + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
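+  // YUY2 packs two pixels into four bytes (Y0 U Y1 V), so a contiguous row
+  // is width * 2 bytes. Coalescing is also bounded by
+  // width * height <= 32768, which appears to keep the combined row within
+  // the range the UV422 row functions can safely handle.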
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_LSX; + } + } +#endif +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_LASX; + YUY2ToUV422Row = YUY2ToUV422Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. 
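+  // UYVY is the byte-swapped sibling of YUY2 (U Y0 V Y1, chroma first); the
+  // same stride and size bounds apply here.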
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUV422Row = UYVYToUV422Row_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif +#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif +#if defined(HAS_UYVYTOYROW_LSX) && defined(HAS_UYVYTOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUV422Row = UYVYToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUV422Row = UYVYToUV422Row_LSX; + } + } +#endif +#if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + UYVYToYRow = UYVYToYRow_Any_LASX; + UYVYToUV422Row = UYVYToUV422Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_LASX; + UYVYToUV422Row = UYVYToUV422Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert YUY2 to Y. +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert UYVY to Y. +LIBYUV_API +int UYVYToY(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + } + } +#endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + } + } +#endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + } + return 0; +} + +// Mirror a plane of data. +// See Also I400Mirror +LIBYUV_API +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + // Negative height means invert the image. 
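+  // e.g. calling MirrorPlane(src, src_stride, dst, dst_stride, width, -480)
+  // mirrors and vertically flips a 480-row plane: src_y is moved to the last
+  // row and the stride negated, so rows are read in reverse order.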
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif +#if defined(HAS_MIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorRow = MirrorRow_Any_LASX; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_LASX; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Mirror a plane of UV data. +LIBYUV_API +void MirrorUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) = + MirrorUVRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorUVRow = MirrorUVRow_Any_NEON; + if (IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorUVRow = MirrorUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorUVRow = MirrorUVRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorUVRow = MirrorUVRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_MSA; + } + } +#endif +#if defined(HAS_MIRRORUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorUVRow = MirrorUVRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_LSX; + } + } +#endif +#if defined(HAS_MIRRORUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorUVRow = MirrorUVRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_LASX; + } + } +#endif + + // MirrorUV plane + for (y = 0; y < height; ++y) { + MirrorUVRow(src_uv, dst_uv, width); + src_uv += src_stride_uv; + dst_uv += dst_stride_uv; + } +} + +// Mirror I400 with optional flipping +LIBYUV_API +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror I420 with optional flipping +LIBYUV_API +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// NV12 mirror. +LIBYUV_API +int NV12Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth, + halfheight); + return 0; +} + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = + ARGBMirrorRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_LASX;
+    }
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    ARGBMirrorRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// RGB24 mirror.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  int y;
+  void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
+      RGB24MirrorRow_C;
+  if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+#if defined(HAS_RGB24MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24MirrorRow = RGB24MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24MirrorRow = RGB24MirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24MirrorRow = RGB24MirrorRow_SSSE3;
+    }
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    RGB24MirrorRow(src_rgb24, dst_rgb24, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Get a blender optimized for the CPU and pixel count.
+// As there are several blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
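+// A rough usage sketch ('fg', 'bg', 'dst' and their strides are hypothetical
+// caller-owned ARGB buffers):
+//   ARGBBlendRow blend = GetARGBBlend();
+//   for (int y = 0; y < height; ++y) {
+//     blend(fg + y * fg_stride, bg + y * bg_stride, dst + y * dst_stride,
+//           width);
+//   }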
+LIBYUV_API +ARGBBlendRow GetARGBBlend() { + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = ARGBBlendRow_C; +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + return ARGBBlendRow; + } +#endif +#if defined(HAS_ARGBBLENDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBBlendRow = ARGBBlendRow_NEON; + } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } +#endif +#if defined(HAS_ARGBBLENDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBBlendRow = ARGBBlendRow_LSX; + } +#endif + return ARGBBlendRow; +} + +// Alpha Blend 2 ARGB images and store to destination. +LIBYUV_API +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = GetARGBBlend(); + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } + + for (y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Alpha Blend plane and store to destination. +LIBYUV_API +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Coalesce rows for Y plane. + if (src_stride_y0 == width && src_stride_y1 == width && + alpha_stride == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; + } + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); + src_y0 += src_stride_y0; + src_y1 += src_stride_y1; + alpha += alpha_stride; + dst_y += dst_stride_y; + } + return 0; +} + +#define MAXTWIDTH 2048 +// Alpha Blend YUV images and store to destination. 
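+// The Y plane is blended at full resolution. For U and V, two rows of the
+// full-resolution alpha plane are first box-filtered down to half width
+// (ScaleRowDown2Box), so each chroma sample is blended with the average
+// alpha of the 2x2 luma block it covers.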
+LIBYUV_API +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + // Half width/height for UV. + int halfwidth = (width + 1) >> 1; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + + if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || + !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Blend Y plane. + BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, + dst_y, dst_stride_y, width, height); + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + if (!IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_C; + } +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_SSSE3; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + ScaleRowDown2 = ScaleRowDown2Box_AVX2; + } + } + } +#endif + + // Row buffer for intermediate alpha pixels. + align_buffer_64(halfalpha, halfwidth); + for (y = 0; y < height; y += 2) { + // last row of odd height image use 1 row of alpha instead of 2. + if (y == (height - 1)) { + alpha_stride = 0; + } + // Subsample 2 rows of UV to half width and half height. + ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth); + alpha += alpha_stride * 2; + BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth); + BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth); + src_u0 += src_stride_u0; + src_u1 += src_stride_u1; + dst_u += dst_stride_u; + src_v0 += src_stride_v0; + src_v1 += src_stride_v1; + dst_v += dst_stride_v; + } + free_aligned_buffer_64(halfalpha); + return 0; +} + +// Multiply 2 ARGB images and store to destination. 
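+// Per channel this computes approximately dst = src0 * src1 / 255 (the row
+// kernels use a fast fixed-point scale rather than an exact divide), so 255
+// acts as 1.0 and a zero in either source forces the channel to zero.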
+LIBYUV_API +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_LSX; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_LASX; + } + } +#endif + + // Multiply plane + for (y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Add 2 ARGB images and store to destination. +LIBYUV_API +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, + int width) = ARGBAddRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
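+  // (ARGBAddRow performs a saturating per-channel add: sums are clamped at
+  // 255, so overflow cannot wrap.)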
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBADDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBAddRow = ARGBAddRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAddRow = ARGBAddRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAddRow = ARGBAddRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAddRow = ARGBAddRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAddRow = ARGBAddRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAddRow = ARGBAddRow_Any_LSX;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAddRow = ARGBAddRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAddRow = ARGBAddRow_Any_LASX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_LASX;
+    }
+  }
+#endif
+
+  // Add plane
+  for (y = 0; y < height; ++y) {
+    ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
+  int y;
+  void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1,
+                          uint8_t* dst, int width) = ARGBSubtractRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_MSA; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_LSX; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_LASX; + } + } +#endif + + // Subtract plane + for (y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to RGB24. +LIBYUV_API +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = + RAWToRGB24Row_C; + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. 
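+  // Both formats are 3 bytes per pixel (RAW stores R,G,B; RGB24 stores
+  // B,G,R), so the conversion is a per-pixel byte reversal and rows coalesce
+  // whenever both strides equal width * 3.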
+ if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_raw = dst_stride_rgb24 = 0; + } +#if defined(HAS_RAWTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + RAWToRGB24Row = RAWToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToRGB24Row = RAWToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToRGB24Row = RAWToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToRGB24Row = RAWToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGB24Row = RAWToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + RAWToRGB24Row(src_raw, dst_rgb24, width); + src_raw += src_stride_raw; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// TODO(fbarchard): Consider uint8_t value +LIBYUV_API +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value) { + int y; + void (*SetRow)(uint8_t* dst, uint8_t value, int width) = SetRow_C; + + if (width <= 0 || height == 0) { + return; + } + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (dst_stride_y == width) { + width *= height; + height = 1; + dst_stride_y = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SetRow = SetRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_NEON; + } + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + SetRow = SetRow_Any_X86; + if (IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } + } +#endif +#if defined(HAS_SETROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + SetRow = SetRow_ERMS; + } +#endif +#if defined(HAS_SETROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { + SetRow = SetRow_MSA; + } +#endif +#if defined(HAS_SETROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SetRow = SetRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_LSX; + } + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + SetRow(dst_y, (uint8_t)value, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8_t* start_y = dst_y + y * dst_stride_y + x; + uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || + y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8_t* 
dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value) {
+  int y;
+  void (*ARGBSetRow)(uint8_t* dst_argb, uint32_t value, int width) =
+      ARGBSetRow_C;
+  if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRow = ARGBSetRow_X86;
+  }
+#endif
+#if defined(HAS_ARGBSETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSetRow = ARGBSetRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBSetRow = ARGBSetRow_Any_LSX;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_LSX;
+    }
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
+
+LIBYUV_API
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
+  int y;
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBAttenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height) {
+  int y;
+  void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                             int width) = ARGBUnattenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+    }
+  }
+#endif
+  // TODO(fbarchard): Neon version.
+
+  for (y = 0; y < height; ++y) {
+    ARGBUnattenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert ARGB to Grayed ARGB.
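+// The gray value is a full-range BT.601 luma (approximately
+// 0.299 R + 0.587 G + 0.114 B) replicated into B, G and R; alpha is
+// passed through unchanged.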
+LIBYUV_API +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif +#if defined(HAS_ARGBGRAYROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBGrayRow = ARGBGrayRow_LASX; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif +#if defined(HAS_ARGBGRAYROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBGrayRow = ARGBGrayRow_LASX; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBSepiaRow)(uint8_t* dst_argb, int width) = ARGBSepiaRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. 
+ if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBSEPIAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_SSSE3; + } +#endif +#if defined(HAS_ARGBSEPIAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_NEON; + } +#endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif +#if defined(HAS_ARGBSEPIAROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_LSX; + } +#endif +#if defined(HAS_ARGBSEPIAROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBSepiaRow = ARGBSepiaRow_LASX; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBSepiaRow(dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. +LIBYUV_API +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height) { + int y; + void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const int8_t* matrix_argb, int width) = + ARGBColorMatrixRow_C; + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_LSX; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. +LIBYUV_API +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height) { + SIMD_ALIGNED(int8_t matrix_argb[16]); + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. 
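+  // The 4x3 input uses 7 fractional bits (128 represents 1.0) while
+  // ARGBColorMatrix expects 6 (64 represents 1.0), hence the divide by 2.
+  // The appended fourth row leaves the alpha channel unchanged.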
+  matrix_argb[0] = matrix_rgb[0] / 2;
+  matrix_argb[1] = matrix_rgb[1] / 2;
+  matrix_argb[2] = matrix_rgb[2] / 2;
+  matrix_argb[3] = matrix_rgb[3] / 2;
+  matrix_argb[4] = matrix_rgb[4] / 2;
+  matrix_argb[5] = matrix_rgb[5] / 2;
+  matrix_argb[6] = matrix_rgb[6] / 2;
+  matrix_argb[7] = matrix_rgb[7] / 2;
+  matrix_argb[8] = matrix_rgb[8] / 2;
+  matrix_argb[9] = matrix_rgb[9] / 2;
+  matrix_argb[10] = matrix_rgb[10] / 2;
+  matrix_argb[11] = matrix_rgb[11] / 2;
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst,
+                         dst_stride_argb, &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
+  int y;
+  void (*ARGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb,
+                            int width) = ARGBColorTableRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBColorTableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height) {
+  int y;
+  void (*RGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb,
+                           int width) = RGBColorTableRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    RGBColorTableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    RGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low-level rows implement it efficiently with 3 parameters, and
+// could be used for other high-level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a fixed-point multiply by the reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
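+// Example: with interval_size = 32, interval_offset = 16 and
+// scale = 65536 / 32 = 2048, each channel posterizes to 8 levels;
+// an input value of 200 becomes (200 * 2048 >> 16) * 32 + 16 = 208.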
+LIBYUV_API +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBQuantizeRow)(uint8_t* dst_argb, int scale, int interval_size, + int interval_offset, int width) = ARGBQuantizeRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || + interval_size < 1 || interval_size > 255) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBQUANTIZEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBQuantizeRow = ARGBQuantizeRow_SSE2; + } +#endif +#if defined(HAS_ARGBQUANTIZEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_NEON; + } +#endif +#if defined(HAS_ARGBQUANTIZEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_MSA; + } +#endif +#if defined(HAS_ARGBQUANTIZEROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_LSX; + } +#endif + for (y = 0; y < height; ++y) { + ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); + dst += dst_stride_argb; + } + return 0; +} + +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height) { + int y; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + int32_t* previous_cumsum = dst_cumsum; + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + for (y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Blur ARGB image. +// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. 
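+// e.g. a radius 5 blur can run with a minimal circular buffer of
+// width * (5 * 2 + 2) * 16 bytes:
+//   align_buffer_64(cumsum, width * 12 * 16);
+//   ARGBBlur(src, src_stride, dst, dst_stride, (int32_t*)cumsum,
+//            width * 4, width, height, 5);
+//   free_aligned_buffer_64(cumsum);
+// where width * 4 is the cumulative sum stride in int32 units (4 per pixel).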
+LIBYUV_API +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius) { + int y; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; + int32_t* cumsum_bot_row; + int32_t* max_cumsum_bot_row; + int32_t* cumsum_top_row; + + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0 || height <= 1) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. + ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, + dst_stride32_cumsum, width, radius); + + src_argb = src_argb + radius * src_stride_argb; + cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + cumsum_top_row = &dst_cumsum[0]; + + for (y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + int n; + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. + if ((y + radius) < height) { + const int32_t* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + n = (width - 1) - radius - x + 1; + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, boxwidth, + area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. 
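+// Each channel of every pixel is scaled by the corresponding channel of
+// |value|, so e.g. value 0x80808080 roughly halves B, G, R and A.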
+LIBYUV_API +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value) { + int y; + void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, + uint32_t value) = ARGBShadeRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHADEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#endif +#if defined(HAS_ARGBSHADEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_NEON; + } +#endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif +#if defined(HAS_ARGBSHADEROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_LSX; + } +#endif +#if defined(HAS_ARGBSHADEROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_LASX; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Interpolate 2 planes by specified amount (0 to 255). +LIBYUV_API +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { + int y; + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src0 || !src1 || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst = dst + (height - 1) * dst_stride; + dst_stride = -dst_stride; + } + // Coalesce rows. 
+ if (src_stride0 == width && src_stride1 == width && dst_stride == width) { + width *= height; + height = 1; + src_stride0 = src_stride1 = dst_stride = 0; + } +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow(dst, src0, src1 - src0, width, interpolation); + src0 += src_stride0; + src1 += src_stride1; + dst += dst_stride; + } + return 0; +} + +// Interpolate 2 planes by specified amount (0 to 255). +LIBYUV_API +int InterpolatePlane_16(const uint16_t* src0, + int src_stride0, + const uint16_t* src1, + int src_stride1, + uint16_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { + int y; + void (*InterpolateRow_16)(uint16_t* dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + if (!src0 || !src1 || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst = dst + (height - 1) * dst_stride; + dst_stride = -dst_stride; + } + // Coalesce rows. 
+ if (src_stride0 == width && src_stride1 == width && dst_stride == width) { + width *= height; + height = 1; + src_stride0 = src_stride1 = dst_stride = 0; + } +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow_16 = InterpolateRow_16_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow_16 = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow_16 = InterpolateRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow_16 = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow_16 = InterpolateRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + InterpolateRow_16 = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow_16 = InterpolateRow_16_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow_16 = InterpolateRow_16_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow_16 = InterpolateRow_16_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow_16 = InterpolateRow_16_LSX; + } + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow_16(dst, src0, src1 - src0, width, interpolation); + src0 += src_stride0; + src1 += src_stride1; + dst += dst_stride; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, + src_stride_argb1, dst_argb, dst_stride_argb, + width * 4, height, interpolation); +} + +// Interpolate 2 YUV images by specified amount (0 to 255). +LIBYUV_API +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, + dst_stride_y, width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, + dst_stride_u, halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, + dst_stride_v, halfwidth, halfheight, interpolation); + return 0; +} + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +LIBYUV_API +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, + const uint8_t* shuffler, int width) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
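+  // A negative height selects a vertically flipped copy: start at the last
+  // row and walk the source with a negative stride.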
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + // Coalesce rows. + if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_LSX; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle AR64 channel order. e.g. AR64 to AB64. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, + const uint8_t* shuffler, int width) = AR64ShuffleRow_C; + if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. + if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_ar64 = 0; + } + // Assembly versions can be reused if it's implemented with shuffle. +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + AR64ShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + AR64ShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + AR64ShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, + width * 2); + src_ar64 += src_stride_ar64; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Gauss blur a float plane using Gaussian 5x5 filter with +// coefficients of 1, 4, 6, 4, 1. +// Each destination pixel is a blur of the 5x5 +// pixels from the source. 
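+// The separable 5x5 kernel is the outer product of (1, 4, 6, 4, 1) with
+// itself; its 25 taps sum to 16 * 16 = 256 and the center weight is 36.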
+// Source edges are clamped. +// Edge is 2 pixels on each side, and interior is multiple of 4. +LIBYUV_API +int GaussPlane_F32(const float* src, + int src_stride, + float* dst, + int dst_stride, + int width, + int height) { + int y; + void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2, + const float* src3, const float* src4, float* dst, + int width) = GaussCol_F32_C; + void (*GaussRow_F32)(const float* src, float* dst, int width) = + GaussRow_F32_C; + if (!src || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + +#if defined(HAS_GAUSSCOL_F32_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + GaussCol_F32 = GaussCol_F32_NEON; + } +#endif +#if defined(HAS_GAUSSROW_F32_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + GaussRow_F32 = GaussRow_F32_NEON; + } +#endif + { + // 2 pixels on each side, but aligned out to 16 bytes. + align_buffer_64(rowbuf, (4 + width + 4) * 4); + memset(rowbuf, 0, 16); + memset(rowbuf + (4 + width) * 4, 0, 16); + float* row = (float*)(rowbuf + 16); + const float* src0 = src; + const float* src1 = src; + const float* src2 = src; + const float* src3 = src2 + ((height > 1) ? src_stride : 0); + const float* src4 = src3 + ((height > 2) ? src_stride : 0); + + for (y = 0; y < height; ++y) { + GaussCol_F32(src0, src1, src2, src3, src4, row, width); + + // Extrude edge by 2 floats + row[-2] = row[-1] = row[0]; + row[width + 1] = row[width] = row[width - 1]; + + GaussRow_F32(row - 2, dst, width); + + src0 = src1; + src1 = src2; + src2 = src3; + src3 = src4; + if ((y + 2) < (height - 1)) { + src4 += src_stride; + } + dst += dst_stride; + } + free_aligned_buffer_64(rowbuf); + } + return 0; +} + +// Sobel ARGB effect. +static int ARGBSobelize(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + void (*SobelRow)(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst, + int width)) { + int y; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = + ARGBToYJRow_C; + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, + const uint8_t* src_y2, uint8_t* dst_sobely, int width) = + SobelXRow_C; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYJRow = ARGBToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif + +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; + } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } +#endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; + } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } +#endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } +#endif + { + // 3 rows with edges before/after. + const int row_size = (width + kEdge + 31) & ~31; + align_buffer_64(rows, row_size * 2 + (kEdge + row_size * 3 + kEdge)); + uint8_t* row_sobelx = rows; + uint8_t* row_sobely = rows + row_size; + uint8_t* row_y = rows + row_size * 2; + + // Convert first row. + uint8_t* row_y0 = row_y + kEdge; + uint8_t* row_y1 = row_y0 + row_size; + uint8_t* row_y2 = row_y1 + row_size; + ARGBToYJRow(src_argb, row_y0, width); + row_y0[-1] = row_y0[0]; + memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. + ARGBToYJRow(src_argb, row_y1, width); + row_y1[-1] = row_y1[0]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); + + for (y = 0; y < height; ++y) { + // Convert next row of ARGB to G. + if (y < (height - 1)) { + src_argb += src_stride_argb; + } + ARGBToYJRow(src_argb, row_y2, width); + row_y2[-1] = row_y2[0]; + row_y2[width] = row_y2[width - 1]; + + SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); + SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); + SobelRow(row_sobelx, row_sobely, dst_argb, width); + + // Cycle thru circular queue of 3 row_y buffers. + { + uint8_t* row_yt = row_y0; + row_y0 = row_y1; + row_y1 = row_y2; + row_y2 = row_yt; + } + + dst_argb += dst_stride_argb; + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// Sobel ARGB effect. 
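+// The source is converted to grayscale, then each output pixel is
+// min(255, |horizontal gradient| + |vertical gradient|) replicated into
+// B, G and R with alpha set to 255.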
+LIBYUV_API +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelRow = SobelRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelRow = SobelRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } + } +#endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } +#endif +#if defined(HAS_SOBELROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SobelRow = SobelRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_LSX; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SobelToPlaneRow = SobelToPlaneRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_LSX; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, + height, SobelToPlaneRow); +} + +// SobelXY ARGB effect. +// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. 
+LIBYUV_API +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXYRow = SobelXYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXYRow = SobelXYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } + } +#endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } +#endif +#if defined(HAS_SOBELXYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SobelXYRow = SobelXYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_LSX; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const float* poly, + int width, + int height) { + int y; + void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const float* poly, int width) = ARGBPolynomialRow_C; + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; + } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height) { + int y; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + src_stride_y >>= 1; + dst_stride_y >>= 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. 
+ if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_HALFFLOATROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + HalfFloatRow = HalfFloatRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_SSE2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HalfFloatRow = HalfFloatRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = HalfFloatRow_AVX2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_F16C) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; + } + } +#endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; + } + } +#endif +#if defined(HAS_HALFFLOATROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HalfFloatRow = HalfFloatRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_MSA; + } + } +#endif +#if defined(HAS_HALFFLOATROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + HalfFloatRow = HalfFloatRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_LSX; + } + } +#endif + + for (y = 0; y < height; ++y) { + HalfFloatRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { + void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, + int width) = ByteToFloatRow_C; + if (!src_y || !dst_y || width <= 0) { + return -1; + } +#if defined(HAS_BYTETOFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ByteToFloatRow = ByteToFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ByteToFloatRow = ByteToFloatRow_NEON; + } + } +#endif + + ByteToFloatRow(src_y, dst_y, scale, width); + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height) { + int y; + void (*ARGBLumaColorTableRow)( + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. 
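+// Only the alpha bytes of the destination are overwritten; its B, G and R
+// bytes are preserved.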
+LIBYUV_API +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Extract just the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + if (!src_argb || !dst_a || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb += (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_a == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_a = 0; + } + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; +#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX + : ARGBExtractAlphaRow_Any_LSX; + } +#endif + + for (int y = 0; y < height; ++y) { + ARGBExtractAlphaRow(src_argb, dst_a, width); + src_argb += src_stride_argb; + dst_a += dst_stride_a; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. 
+LIBYUV_API +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +LIBYUV_API +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2, + uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C; + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + } + } +#endif +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_LASX; + } + } +#endif + +#if defined(HAS_YUY2TONVUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TONVUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TONVUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width); + } + return 0; +} + +LIBYUV_API +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_SPLITUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SplitUVRow = SplitUVRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_LSX; + } + } +#endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + + { + int awidth = halfwidth * 2; + // row of y and 2 rows of uv + align_buffer_64(rows, awidth * 3); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_uyvy, rows + awidth, rows, awidth); + memcpy(dst_y, rows, width); + SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth); + memcpy(dst_y + dst_stride_y, rows, width); + InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. + SplitUVRow(src_uyvy, dst_uv, rows, awidth); + memcpy(dst_y, rows, width); + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// width and height are src size allowing odd size handling. +LIBYUV_API +void HalfMergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u, + const uint8_t* src_v, int src_stride_v, + uint8_t* dst_uv, int width) = HalfMergeUVRow_C; + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } +#if defined(HAS_HALFMERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + HalfMergeUVRow = HalfMergeUVRow_NEON; + } +#endif +#if defined(HAS_HALFMERGEUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + HalfMergeUVRow = HalfMergeUVRow_SSSE3; + } +#endif +#if defined(HAS_HALFMERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + HalfMergeUVRow = HalfMergeUVRow_AVX2; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + // Merge a row of U and V into a row of UV. + HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); + src_u += src_stride_u * 2; + src_v += src_stride_v * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width); + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate.cc b/source/rotate.cc new file mode 100644 index 00000000..8d3978c7 --- /dev/null +++ b/source/rotate.cc @@ -0,0 +1,1196 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/convert.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int i = height; +#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) + void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx16_C; +#else + void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx8_C; +#endif + +#if defined(HAS_TRANSPOSEWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_Any_NEON; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_NEON; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + TransposeWx8 = TransposeWx8_Fast_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; + } + } +#endif +#if defined(HAS_TRANSPOSEWX16_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + TransposeWx16 = TransposeWx16_Any_LSX; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_LSX; + } + } +#endif + +#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) + // Work across the source in 16x16 tiles + while (i >= 16) { + TransposeWx16(src, src_stride, dst, dst_stride, width); + src += 16 * src_stride; 
// Go down 16 rows. + dst += 16; // Move over 16 columns. + i -= 16; + } +#else + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } +#endif + + if (i > 0) { + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + } +} + +LIBYUV_API +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Swap top and bottom row and mirror the content. Uses a temporary row. + align_buffer_64(row, width); + const uint8_t* src_bot = src + src_stride * (height - 1); + uint8_t* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif +#if defined(HAS_MIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorRow = MirrorRow_Any_LASX; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_LASX; + } + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
+
+  // Odd height will harmlessly mirror the middle row twice.
+  for (y = 0; y < half_height; ++y) {
+    CopyRow(src, row, width);        // Copy top row into buffer
+    MirrorRow(src_bot, dst, width);  // Mirror bottom row into top row
+    MirrorRow(row, dst_bot, width);  // Mirror buffer into bottom row
+    src += src_stride;
+    dst += dst_stride;
+    src_bot -= src_stride;
+    dst_bot -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+void SplitTransposeUV(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height) {
+  int i = height;
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                          int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                          int width) = TransposeUVWx16_C;
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+  void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                          int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                          int width) = TransposeUVWx16_C;
+#else
+  void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                         int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                         int width) = TransposeUVWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx16 = TransposeUVWx16_MSA;
+    }
+  }
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    TransposeUVWx16 = TransposeUVWx16_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx16 = TransposeUVWx16_LSX;
+    }
+  }
+#else
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeUVWx8 = TransposeUVWx8_NEON;
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx8 = TransposeUVWx8_SSE2;
+    }
+  }
+#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  // Work through the source in 16x16 tiles.
+  while (i >= 16) {
+    TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                    width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst_a += 16;             // Move over 16 columns.
+    dst_b += 16;             // Move over 16 columns.
+    i -= 16;
+  }
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+  // Work through the source in 16x16 tiles.
+  while (i >= 16) {
+    TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                    width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst_a += 16;             // Move over 16 columns.
+    dst_b += 16;             // Move over 16 columns.
+    i -= 16;
+  }
+#else
+  // Work through the source in 8x8 tiles.
+  while (i >= 8) {
+    TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width);
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst_a += 8;             // Move over 8 columns.
+    dst_b += 8;             // Move over 8 columns.
+ i -= 8; + } +#endif + + if (i > 0) { + TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width, i); + } +} + +LIBYUV_API +void SplitRotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width, height); +} + +LIBYUV_API +void SplitRotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width, height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void SplitRotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i; + void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorSplitUVRow_C; +#if defined(HAS_MIRRORSPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorSplitUVRow = MirrorSplitUVRow_NEON; + } +#endif +#if defined(HAS_MIRRORSPLITUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORSPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorSplitUVRow = MirrorSplitUVRow_MSA; + } +#endif +#if defined(HAS_MIRRORSPLITUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) { + MirrorSplitUVRow = MirrorSplitUVRow_LSX; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorSplitUVRow(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +// Rotate UV and split into planar. +// width and height expected to be half size for NV12 +LIBYUV_API +int SplitRotateUV(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + case kRotate90: + SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + case kRotate270: + SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + case kRotate180: + SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. 
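+  // For example, a hypothetical bottom-up 640x480 plane can be passed as
+  //   RotatePlane(src, 640, dst, 480, 640, -480, kRotate90);
+  // The adjustment below repoints src at its last row and negates the
+  // stride, so the rest of the function can treat the image as top-down.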
+ if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, dst, dst_stride, width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +void TransposePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i = height; + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8_16_C(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } + + if (i > 0) { + TransposeWxH_16_C(src, src_stride, dst, dst_stride, width, i); + } +} + +static void RotatePlane90_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane270_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane180_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Swap top and bottom row and mirror the content. Uses a temporary row. + align_buffer_64_16(row, width); + const uint16_t* src_bot = src + src_stride * (height - 1); + uint16_t* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + CopyRow_16_C(src, row, width); // Copy top row into buffer + MirrorRow_16_C(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow_16_C(row, dst_bot, width); // Mirror buffer into bottom row + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64_16(row); +} + +LIBYUV_API +int RotatePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. 
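+  // Usage sketch (hypothetical 10-bit plane; strides are in uint16_t
+  // elements, not bytes):
+  //   RotatePlane_16(src, 320, dst, 240, 320, 240, kRotate270);
+  // rotates a 320x240 plane of 16-bit samples into a 240x320 destination.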
+ if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate180: + RotatePlane180_16(src, src_stride, dst, dst_stride, width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} + +// I422 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. +LIBYUV_API +int I422Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + // Negative height means invert the image. 
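+  // Unlike I420, I422 chroma planes are full height, so the inversion below
+  // steps all three planes back by (height - 1) rows.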
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // Copy frame + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. + + case kRotate90: + RotatePlane90(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane90(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; + case kRotate270: + RotatePlane270(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane270(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + height); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I444Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
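+  // I444 planes are all width x height, so each of the three planes below
+  // rotates independently with no rescaling (contrast with I422 above).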
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || + !dst_v) { + return -1; + } + + // Negative height means invert the image. 
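+  // Usage sketch (hypothetical 640x480 NV12 frame, quarter turn):
+  //   NV12ToI420Rotate(y, 640, uv, 640, dst_y, 480, dst_u, 240, dst_v, 240,
+  //                    640, 480, kRotate90);
+  // The interleaved UV plane is de-interleaved and rotated in a single pass
+  // by the SplitRotateUV90 call below.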
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +static void SplitPixels(const uint8_t* src_u, + int src_pixel_stride_uv, + uint8_t* dst_u, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst_u = *src_u; + ++dst_u; + src_u += src_pixel_stride_uv; + } +} + +// Convert Android420 to I420 with Rotate +LIBYUV_API +int Android420ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode rotation) { + int y; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
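+  // src_pixel_stride_uv selects the layouts handled below: 1 is planar
+  // chroma (I420); 2 with V one byte after U (vu_off == 1) is NV12; 2 with
+  // U one byte after V (vu_off == -1) is NV21. Anything else falls back to
+  // SplitPixels, which only supports kRotate0.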
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + RotatePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + rotation); + } + + // Copy UV planes - I420 + if (src_pixel_stride_uv == 1) { + RotatePlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight, + rotation); + RotatePlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight, + rotation); + return 0; + } + // Split UV planes - NV21 + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + SplitRotateUV(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, + halfwidth, halfheight, rotation); + return 0; + } + // Split UV planes - NV12 + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + SplitRotateUV(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight, rotation); + return 0; + } + + if (rotation == 0) { + for (y = 0; y < halfheight; ++y) { + SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); + SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; + } + // unsupported type and/or rotation. + return -1; +} + +LIBYUV_API +int I010Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. 
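+  // Note: the 16-bit 90/270 rotations below go through TransposePlane_16,
+  // which uses only the portable TransposeWx8_16_C kernel, and the 180 path
+  // likewise uses C row functions, so I010 rotation has no SIMD fast path.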
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I010Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} + +// I210 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. +LIBYUV_API +int I210Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // Copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. 
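+      // Worked example: rotating a 640x480 I210 frame by 90 degrees. The
+      // rotated Y plane is 480x640. Each rotated chroma plane is 480x320
+      // (height x halfwidth), which fits inside the 480x640 Y destination,
+      // and is then scaled to 240x640 (halfheight x width) to restore
+      // 4:2:2 sampling for the rotated frame.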
+ + case kRotate90: + RotatePlane90_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane90_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane270_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I410Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate_any.cc b/source/rotate_any.cc new file mode 100644 index 00000000..88ca7876 --- /dev/null +++ b/source/rotate_any.cc @@ -0,0 +1,79 @@ +/* + * Copyright 2015 The LibYuv Project Authors. 
All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } + +#ifdef HAS_TRANSPOSEWX8_NEON +TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_SSSE3 +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) +#endif +#ifdef HAS_TRANSPOSEWX16_MSA +TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) +#endif +#ifdef HAS_TRANSPOSEWX16_LSX +TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, 15) +#endif +#undef TANY + +#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } + +#ifdef HAS_TRANSPOSEUVWX8_NEON +TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX8_SSE2 +TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX16_MSA +TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX16_LSX +TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7) +#endif +#undef TUVANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc new file mode 100644 index 00000000..c7239010 --- /dev/null +++ b/source/rotate_argb.cc @@ -0,0 +1,257 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "libyuv/rotate_argb.h"
+
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static int ARGBTranspose(const uint8_t* src_argb,
+                         int src_stride_argb,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height) {
+  int i;
+  int src_pixel_step = src_stride_argb >> 2;
+  void (*ScaleARGBRowDownEven)(
+      const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
+      uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+  // Check stride is a multiple of 4.
+  if (src_stride_argb & 3) {
+    return -1;
+  }
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_LSX;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_LSX;
+    }
+  }
+#endif
+
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
+    ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+    dst_argb += dst_stride_argb;
+    src_argb += 4;
+  }
+  return 0;
+}
+
+static int ARGBRotate90(const uint8_t* src_argb,
+                        int src_stride_argb,
+                        uint8_t* dst_argb,
+                        int dst_stride_argb,
+                        int width,
+                        int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src_argb += src_stride_argb * (height - 1);
+  src_stride_argb = -src_stride_argb;
+  return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                       width, height);
+}
+
+static int ARGBRotate270(const uint8_t* src_argb,
+                         int src_stride_argb,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst_argb += dst_stride_argb * (width - 1);
+  dst_stride_argb = -dst_stride_argb;
+  return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                       width, height);
+}
+
+static int ARGBRotate180(const uint8_t* src_argb,
+                         int src_stride_argb,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
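+  // Note the units in the loop below: ARGBMirrorRow takes a width in
+  // pixels, while CopyRow takes a byte count, hence width * 4 for the
+  // 4-byte ARGB pixels.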
+ align_buffer_64(row, width * 4); + const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); + uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBMirrorRow_C; + void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + CopyRow_C; +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_LASX; + } + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + src_bot -= src_stride_argb; + dst_bot -= dst_stride_argb; + } + free_aligned_buffer_64(row); + return 0; +} + +LIBYUV_API +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. 
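+  // Usage sketch (hypothetical buffers; ARGB strides are in bytes):
+  //   ARGBRotate(src, 640 * 4, dst, 480 * 4, 640, 480, kRotate90);
+  // rotates a 640x480 ARGB image into a 480x640 destination.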
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); + case kRotate90: + return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); + case kRotate270: + return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); + case kRotate180: + return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate_common.cc b/source/rotate_common.cc new file mode 100644 index 00000000..4b496d1b --- /dev/null +++ b/source/rotate_common.cc @@ -0,0 +1,229 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[((i >> 1) * dst_stride_a) + j] = src[i + (j * src_stride)]; + dst_b[((i >> 1) * dst_stride_b) + j] = src[i + (j * src_stride) + 1]; + } + } +} + +void TransposeWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width) { + int 
i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst_a, + int dst_stride_a, + uint16_t* dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + int i; + for (i = 0; i < width; i += 4) { + uint32_t p00 = ((uint32_t*)(src))[0]; + uint32_t p10 = ((uint32_t*)(src))[1]; + uint32_t p20 = ((uint32_t*)(src))[2]; + uint32_t p30 = ((uint32_t*)(src))[3]; + uint32_t p01 = ((uint32_t*)(src1))[0]; + uint32_t p11 = ((uint32_t*)(src1))[1]; + uint32_t p21 = ((uint32_t*)(src1))[2]; + uint32_t p31 = ((uint32_t*)(src1))[3]; + uint32_t p02 = ((uint32_t*)(src2))[0]; + uint32_t p12 = ((uint32_t*)(src2))[1]; + uint32_t p22 = ((uint32_t*)(src2))[2]; + uint32_t p32 = ((uint32_t*)(src2))[3]; + uint32_t p03 = ((uint32_t*)(src3))[0]; + uint32_t p13 = ((uint32_t*)(src3))[1]; + uint32_t p23 = ((uint32_t*)(src3))[2]; + uint32_t p33 = ((uint32_t*)(src3))[3]; + ((uint32_t*)(dst))[0] = p00; + ((uint32_t*)(dst))[1] = p01; + ((uint32_t*)(dst))[2] = p02; + ((uint32_t*)(dst))[3] = p03; + ((uint32_t*)(dst1))[0] = p10; + ((uint32_t*)(dst1))[1] = p11; + ((uint32_t*)(dst1))[2] = p12; + ((uint32_t*)(dst1))[3] = p13; + ((uint32_t*)(dst2))[0] = p20; + ((uint32_t*)(dst2))[1] = p21; + ((uint32_t*)(dst2))[2] = p22; + ((uint32_t*)(dst2))[3] = p23; + ((uint32_t*)(dst3))[0] = p30; + ((uint32_t*)(dst3))[1] = p31; + ((uint32_t*)(dst3))[2] = p32; + ((uint32_t*)(dst3))[3] = p33; + src += src_stride * 4; // advance 4 rows + src1 += src_stride * 4; + src2 += src_stride * 4; + src3 += src_stride * 4; + dst += 4 * 4; // advance 4 columns + dst1 += 4 * 4; + dst2 += 4 * 4; + dst3 += 4 * 4; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc new file mode 100644 index 00000000..fd5eee05 --- /dev/null +++ b/source/rotate_gcc.cc @@ -0,0 +1,503 @@ +/* + * Copyright 2015 The LibYuv 
Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. +#if defined(HAS_TRANSPOSEWX8_SSSE3) +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSEWX8_SSSE3) + +// Transpose 16x8. 64 bit +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
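+      // The three "rounds of bit swap" form a butterfly network: punpcklbw
+      // interleaves bytes from row pairs, punpcklwd interleaves 16-bit
+      // pairs, punpckldq interleaves 32-bit pairs; after log2(8) = 3 rounds
+      // each register holds one transposed output row.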
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
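+      // (This "Fast" variant transposes a 16-pixel-wide swath per pass,
+      // keeping the high halves of each load in xmm8-xmm15, which is why
+      // the kernel requires x86-64.)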
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); +} +#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + +// Transpose UV 8x8. 64 bit. +#if defined(HAS_TRANSPOSEUVWX8_SSE2) +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); +} +#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_SSE2) +// 4 values, little endian view +// a b c d +// e f g h +// i j k l +// m n o p + +// transpose 2x2 +// a e b f from row 0, 1 +// i m j n from row 2, 3 +// c g d h from row 0, 1 +// k o l p from row 2, 3 + +// transpose 4x4 +// a e i m from row 0, 1 +// b f j n from row 0, 1 +// c g k o from row 2, 3 +// d h l p from row 2, 3 + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. 
+ "1: \n" + "movdqu (%0),%%xmm0 \n" // a b c d + "movdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "movdqu (%0),%%xmm2 \n" // i j k l + "movdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "movdqa %%xmm0,%%xmm6 \n" + "movdqa %%xmm2,%%xmm7 \n" + "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1 + "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3 + "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1 + "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "movdqa %%xmm4,%%xmm0 \n" + "movdqa %%xmm4,%%xmm1 \n" + "movdqa %%xmm6,%%xmm2 \n" + "movdqa %%xmm6,%%xmm3 \n" + "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1 + "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1 + "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3 + "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3 + + "movdqu %%xmm0,(%1) \n" + "lea 16(%1,%4),%1 \n" // dst += stride + 16 + "movdqu %%xmm1,-16(%1) \n" + "movdqu %%xmm2,-16(%1,%4) \n" + "movdqu %%xmm3,-16(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_AVX2) + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_AVX2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 2 blocks of 4x4. Read a column, write a row. + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // a b c d + "vmovdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vmovdqu (%0),%%xmm2 \n" // i j k l + "vmovdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d + "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l + "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1 + "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3 + "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1 + "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1 + "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1 + "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3 + "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3 + + "vmovdqu %%ymm0,(%1) \n" + "lea 32(%1,%4),%1 \n" // dst += stride + 32 + "vmovdqu %%ymm1,-32(%1) \n" + "vmovdqu %%ymm2,-32(%1,%4) \n" + "vmovdqu %%ymm3,-32(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_AVX2) + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate_lsx.cc 
b/source/rotate_lsx.cc new file mode 100644 index 00000000..94a2b91c --- /dev/null +++ b/source/rotate_lsx.cc @@ -0,0 +1,243 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Copyright (c) 2022 Loongson Technology Corporation Limited + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#include "libyuv/loongson_intrinsics.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \ + } + +#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \ + } + +#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \ + } + +#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \ + } + +#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \ + _stride3, _stride4) \ + { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + __lsx_vstx(_dst2, _dst, _stride2); \ + __lsx_vstx(_dst3, _dst, _stride3); \ + _dst += _stride4; \ + } + +#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \ + { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + _dst += _stride2; \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_LSX(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + int len = width / 16; + uint8_t* s; + int src_stride2 = src_stride << 1; + int src_stride3 = src_stride + src_stride2; + int src_stride4 = src_stride2 << 1; + int dst_stride2 = dst_stride << 1; + int dst_stride3 = dst_stride + dst_stride2; + int dst_stride4 = dst_stride2 << 1; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < len; x++) { + s = (uint8_t*)src; + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, 
tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + res8 = __lsx_vilvl_w(reg4, reg0); + res9 = __lsx_vilvh_w(reg4, reg0); + ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + res8 = __lsx_vilvl_w(reg5, reg1); + res9 = __lsx_vilvh_w(reg5, reg1); + ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + res8 = __lsx_vilvl_w(reg6, reg2); + res9 = __lsx_vilvh_w(reg6, reg2); + ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + res8 = __lsx_vilvl_w(reg7, reg3); + res9 = __lsx_vilvh_w(reg7, reg3); + ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + src += 16; + } +} + +void TransposeUVWx16_LSX(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + int len = width / 8; + uint8_t* s; + int src_stride2 = src_stride << 1; + int src_stride3 = src_stride + src_stride2; + int src_stride4 = src_stride2 << 1; + int dst_stride_a2 = dst_stride_a << 1; + int dst_stride_b2 = dst_stride_b << 1; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < len; x++) { + s = (uint8_t*)src; + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = 
__lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + res8 = __lsx_vilvl_w(reg4, reg0); + res9 = __lsx_vilvh_w(reg4, reg0); + ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + res8 = __lsx_vilvl_w(reg5, reg1); + res9 = __lsx_vilvh_w(reg5, reg1); + ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + res8 = __lsx_vilvl_w(reg6, reg2); + res9 = __lsx_vilvh_w(reg6, reg2); + ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + res8 = __lsx_vilvl_w(reg7, reg3); + res9 = __lsx_vilvh_w(reg7, reg3); + ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + src += 16; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/source/rotate_msa.cc b/source/rotate_msa.cc new file mode 100644 index 00000000..99bdca65 --- /dev/null +++ b/source/rotate_msa.cc @@ -0,0 +1,250 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ + out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ + out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ + out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ + } + +#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ + out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ + out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ + out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ + } + +#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ + out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ + out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ + out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ + } + +#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ + out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 16) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, 
src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + src += 16; + dst += dst_stride * 4; + } +} + +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 8) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = 
(v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + src += 16; + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc new file mode 100644 index 00000000..569a7318 --- /dev/null +++ b/source/rotate_neon.cc @@ -0,0 +1,458 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "vld1.8 {d0}, [%0], %2 \n" + "vld1.8 {d1}, [%0], %2 \n" + "vld1.8 {d2}, [%0], %2 \n" + "vld1.8 {d3}, [%0], %2 \n" + "vld1.8 {d4}, [%0], %2 \n" + "vld1.8 {d5}, [%0], %2 \n" + "vld1.8 {d6}, [%0], %2 \n" + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + "vst1.8 {d1}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d3}, [%0], %4 \n" + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d5}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d7}, [%0], %4 \n" + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "vld1.32 {d0[0]}, [%0], %2 \n" + "vld1.32 {d0[1]}, [%0], %2 \n" + "vld1.32 {d1[0]}, [%0], %2 \n" + "vld1.32 {d1[1]}, [%0], %2 \n" + "vld1.32 {d2[0]}, [%0], %2 \n" + "vld1.32 {d2[1]}, [%0], %2 \n" + "vld1.32 {d3[0]}, [%0], %2 \n" + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
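+      // d4/d5 hold the transposed 4x4 block from rows 0-3 and d0/d1 the
+      // block from rows 4-7; each vst1.32 lane store writes a 4-byte
+      // segment of one destination row.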
+ "vst1.32 {d4[0]}, [%0], %4 \n" + "vst1.32 {d4[1]}, [%0], %4 \n" + "vst1.32 {d5[0]}, [%0], %4 \n" + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + "vst1.32 {d0[0]}, [%0], %4 \n" + "vst1.32 {d0[1]}, [%0], %4 \n" + "vst1.32 {d1[0]}, [%0], %4 \n" + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld1.16 {d0[0]}, [%0], %2 \n" + "vld1.16 {d1[0]}, [%0], %2 \n" + "vld1.16 {d0[1]}, [%0], %2 \n" + "vld1.16 {d1[1]}, [%0], %2 \n" + "vld1.16 {d0[2]}, [%0], %2 \n" + "vld1.16 {d1[2]}, [%0], %2 \n" + "vld1.16 {d0[3]}, [%0], %2 \n" + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%1], %2 \n" + "vld1.8 {d0[1]}, [%1], %2 \n" + "vld1.8 {d0[2]}, [%1], %2 \n" + "vld1.8 {d0[3]}, [%1], %2 \n" + "vld1.8 {d0[4]}, [%1], %2 \n" + "vld1.8 {d0[5]}, [%1], %2 \n" + "vld1.8 {d0[6]}, [%1], %2 \n" + "vld1.8 {d0[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; + +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "vld2.8 {d0, d1}, [%0], %2 \n" + "vld2.8 {d2, d3}, [%0], %2 \n" + "vld2.8 {d4, d5}, [%0], %2 \n" + "vld2.8 {d6, d7}, [%0], %2 \n" + "vld2.8 {d16, d17}, [%0], %2 \n" + "vld2.8 {d18, d19}, [%0], %2 \n" + "vld2.8 {d20, d21}, [%0], %2 \n" + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d6}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d18}, [%0], %4 \n" + "vst1.8 {d16}, [%0], %4 \n" + "vst1.8 {d22}, [%0], %4 \n" + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.8 {d3}, [%0], %6 \n" + "vst1.8 {d1}, [%0], %6 \n" + "vst1.8 {d7}, [%0], %6 \n" + "vst1.8 {d5}, [%0], %6 \n" + "vst1.8 {d19}, [%0], %6 \n" + "vst1.8 {d17}, [%0], %6 \n" + "vst1.8 {d23}, [%0], %6 \n" + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "vld1.64 {d0}, [%0], %2 \n" + "vld1.64 {d1}, [%0], %2 \n" + "vld1.64 {d2}, [%0], %2 \n" + "vld1.64 {d3}, [%0], %2 \n" + "vld1.64 {d4}, [%0], %2 \n" + "vld1.64 {d5}, [%0], %2 \n" + "vld1.64 {d6}, [%0], %2 \n" + "vld1.64 {d7}, [%0] \n" + + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + "vst1.32 {d16[0]}, [%0], %4 \n" + "vst1.32 {d16[1]}, [%0], %4 \n" + "vst1.32 {d17[0]}, [%0], %4 \n" + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + "vst1.32 {d20[0]}, [%0], %4 \n" + "vst1.32 {d20[1]}, [%0], %4 \n" + "vst1.32 {d21[0]}, [%0], %4 \n" + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.32 {d18[0]}, [%0], %6 \n" + "vst1.32 {d18[1]}, [%0], %6 \n" + "vst1.32 {d19[0]}, [%0], %6 \n" + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + "vst1.32 {d22[0]}, [%0], %6 \n" + "vst1.32 {d22[1]}, [%0], %6 \n" + "vst1.32 {d23[0]}, [%0], %6 \n" + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + 
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.64 {d1}, [%0], %6 \n" + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" + "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n" + "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n" + "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n" + "subs %8, %8, #4 \n" // w -= 4 + "vst1.8 {q0}, [%4]! \n" + "vst1.8 {q1}, [%5]! \n" + "vst1.8 {q2}, [%6]! \n" + "vst1.8 {q3}, [%7]! \n" + "bgt 1b \n" + + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "q0", "q1", "q2", "q3"); +} + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc new file mode 100644 index 00000000..95047fa7 --- /dev/null +++ b/source/rotate_neon64.cc @@ -0,0 +1,482 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. 
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + "mov %0, %1 \n" + + "trn2 v16.8b, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "trn1 v17.8b, v0.8b, v1.8b \n" + "add %0, %0, %5 \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 1 + "trn1 v19.8b, v2.8b, v3.8b \n" + "add %0, %0, %5 \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 2 + "trn1 v21.8b, v4.8b, v5.8b \n" + "add %0, %0, %5 \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 3 + "trn1 v23.8b, v6.8b, v7.8b \n" + "add %0, %0, %5 \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 4 + "trn1 v1.4h, v17.4h, v19.4h \n" + "add %0, %0, %5 \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 5 + "trn1 v0.4h, v16.4h, v18.4h \n" + "add %0, %0, %5 \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 6 + "trn1 v5.4h, v21.4h, v23.4h \n" + "add %0, %0, %5 \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 7 + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" + + "mov %0, %2 \n" + + "st1 {v17.8b}, [%0], %6 \n" + "st1 {v16.8b}, [%0], %6 \n" + "st1 {v19.8b}, [%0], %6 \n" + "st1 {v18.8b}, [%0], %6 \n" + "st1 {v21.8b}, [%0], %6 \n" + "st1 {v20.8b}, [%0], %6 \n" + "st1 {v23.8b}, [%0], %6 \n" + "st1 {v22.8b}, [%0] \n" + + "add %1, %1, #8 \n" // src += 8 + "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride + "subs %w3, %w3, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" + + "cmp %w3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
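+      // v3 holds the transposed 4x4 block from rows 0-3 and v0 the block
+      // from rows 4-7; each st1 lane store writes a 4-byte segment of one
+      // destination row.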
+ "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" + + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" + + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" + + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" + + "mov %0, %2 \n" + + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" + + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" + + "st1 {v0.8b}, [%2] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); +} + +static const uint8_t kVTbl4x4TransposeDi[32] = { + 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; + +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + "mov %0, %1 \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %w4, %w4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" + + "cmp %w4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" + + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride_a), // %6 + "r"((ptrdiff_t)dst_stride_b), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); +} + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* 
src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n" + "subs %w8, %w8, #4 \n" // w -= 4 + "st1 {v0.4s}, [%4], 16 \n" + "st1 {v1.4s}, [%5], 16 \n" + "st1 {v2.4s}, [%6], 16 \n" + "st1 {v3.4s}, [%7], 16 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/rotate_win.cc b/source/rotate_win.cc new file mode 100644 index 00000000..a78873f8 --- /dev/null +++ b/source/rotate_win.cc @@ -0,0 +1,253 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) + +__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + __asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 20] // width + + // Read in the data from the source pointer. + // First round of bit swap. + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. + punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. 
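+    // Same pattern as the GCC version: each punpckldq completes two
+    // 8-byte rows in one register; movq stores the low row and palignr
+    // rotates the high row down for the next store.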
+ punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + // Read in the data from the source pointer. + // First round of bit swap. + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqu [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqu xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. 
+    punpckldq  xmm1, xmm5
+    movlpd     qword ptr [edx], xmm1
+    movhpd     qword ptr [ebx], xmm1
+    punpckhdq  xmm0, xmm5
+    movlpd     qword ptr [edx + esi], xmm0
+    lea        edx, [edx + 2 * esi]
+    movhpd     qword ptr [ebx + ebp], xmm0
+    lea        ebx, [ebx + 2 * ebp]
+    movdqa     xmm0, xmm3  // use xmm0 as the temp register.
+    punpckldq  xmm3, xmm7
+    movlpd     qword ptr [edx], xmm3
+    movhpd     qword ptr [ebx], xmm3
+    punpckhdq  xmm0, xmm7
+    sub        ecx, 8
+    movlpd     qword ptr [edx + esi], xmm0
+    lea        edx, [edx + 2 * esi]
+    movhpd     qword ptr [ebx + ebp], xmm0
+    lea        ebx, [ebx + 2 * ebp]
+    jg         convertloop
+
+    mov        esp, [esp + 16]
+    pop        ebp
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/row_any.cc b/source/row_any.cc
new file mode 100644
index 00000000..e574543c
--- /dev/null
+++ b/source/row_any.cc
@@ -0,0 +1,2459 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// memset for vin is meant to clear the source buffer so that
+// SIMD code that reads a full multiple of 16 bytes will not trigger msan
+// errors. memset is not needed for production, as the garbage values are
+// processed but not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// by the source type (e.g. ARGB) and the mask (last parameter), or by
+// examining the source code for how much the source pointers are advanced.
+
+// Subsampled source width needs to be increased by 1 if not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 4 planes to 1
+#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,                    \
+               const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr,  \
+               int width) {                                                   \
+    SIMD_ALIGNED(uint8_t vin[64 * 4]);                                        \
+    SIMD_ALIGNED(uint8_t vout[64]);                                           \
+    memset(vin, 0, sizeof(vin)); /* for msan */                               \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n);                       \
+    }                                                                         \
+    memcpy(vin, y_buf + n, r);                                                \
+    memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));                 \
+    memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));                \
+    memcpy(vin + 192, a_buf + n, r);                                          \
+    ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, MASK + 1);            \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP);     \
+  }
+
+#ifdef HAS_MERGEARGBROW_SSE2
+ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7)
+#endif
+#ifdef HAS_MERGEARGBROW_AVX2
+ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_MERGEARGBROW_NEON
+ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15)
+#endif
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
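+
+// To make the remainder handling in ANY41 concrete, a sketch of the same
+// pattern for a single plane (AnyWidthSketch and ScalarRow are illustrative
+// names, not libyuv APIs): with width 100 and MASK 15, n = 96 pixels take
+// the SIMD path, and the last r = 4 are padded into the zeroed vin buffer,
+// processed as one extra full block, and only the 4 valid results kept.
+#if 0  // reference sketch only, not compiled
+static void ScalarRow(const uint8_t* src, uint8_t* dst, int width) {
+  memcpy(dst, src, width);  // stands in for a SIMD row function
+}
+static void AnyWidthSketch(const uint8_t* src, uint8_t* dst, int width) {
+  SIMD_ALIGNED(uint8_t vin[64]);
+  SIMD_ALIGNED(uint8_t vout[64]);
+  memset(vin, 0, sizeof(vin)); /* for msan */
+  int r = width & 15;   /* remainder pixels */
+  int n = width & ~15;  /* multiple-of-16 portion */
+  if (n > 0) {
+    ScalarRow(src, dst, n);  /* full blocks written straight to dst */
+  }
+  memcpy(vin, src + n, r);   /* pad the tail into the safe buffer */
+  ScalarRow(vin, vout, 16);  /* one full block over padded data */
+  memcpy(dst + n, vout, r);  /* keep only the r valid results */
+}
+#endif
+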
+// Any 4 planes to 1 with yuvconstants +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[64 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 192, a_buf + n, r); \ + if (width & 1) { \ + vin[64 + SS(r, UVSHIFT)] = vin[64 + SS(r, UVSHIFT) - 1]; \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I444ALPHATOARGBROW_SSSE3 +ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7) +#endif +#ifdef HAS_I444ALPHATOARGBROW_AVX2 +ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444ALPHATOARGBROW_NEON +ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_NEON +ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) +#endif +#ifdef HAS_I444ALPHATOARGBROW_MSA +ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_LSX +ANY41C(I422AlphaToARGBRow_Any_LSX, I422AlphaToARGBRow_LSX, 1, 0, 4, 15) +#endif +#ifdef HAS_I422ALPHATOARGBROW_LASX +ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) +#endif +#undef ANY41C + +// Any 4 planes to 1 plane of 8 bit with yuvconstants +#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +ANY41CT(I210AlphaToARGBRow_Any_SSSE3, + I210AlphaToARGBRow_SSSE3, + 1, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I210ALPHATOARGBROW_AVX2 +ANY41CT(I210AlphaToARGBRow_Any_AVX2, + I210AlphaToARGBRow_AVX2, + 1, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 
+ANY41CT(I410AlphaToARGBRow_Any_SSSE3, + I410AlphaToARGBRow_SSSE3, + 0, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_AVX2 +ANY41CT(I410AlphaToARGBRow_Any_AVX2, + I410AlphaToARGBRow_AVX2, + 0, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#undef ANY41CT + +// Any 4 planes to 1 plane with parameter +#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE vin[16 * 4]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ + } \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_MERGEAR64ROW_AVX2 +ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEAR64ROW_NEON +ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +ANY41PT(MergeARGB16To8Row_Any_AVX2, + MergeARGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_NEON +ANY41PT(MergeARGB16To8Row_Any_NEON, + MergeARGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY41PT + +// Any 3 planes to 1. +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(vin, vin + 64, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ + } + +// Merge functions. 
+#ifdef HAS_MERGERGBROW_SSSE3 +ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_NEON +ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGEXRGBROW_SSE2 +ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) +#endif +#ifdef HAS_MERGEXRGBROW_AVX2 +ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_MERGEXRGBROW_NEON +ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_SSE2 +ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) +ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOYUY2ROW_NEON +ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOYUY2ROW_LSX +ANY31(I422ToYUY2Row_Any_LSX, I422ToYUY2Row_LSX, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_LASX +ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOUYVYROW_NEON +ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOUYVYROW_LSX +ANY31(I422ToUYVYRow_Any_LSX, I422ToUYVYRow_LSX, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOUYVYROW_LASX +ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) +#endif +#ifdef HAS_BLENDPLANEROW_AVX2 +ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) +#endif +#ifdef HAS_BLENDPLANEROW_SSSE3 +ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) +#endif +#undef ANY31 + +// Note that odd width replication includes 444 due to implementation +// on arm that subsamples 444 to 422 internally. 
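To make the replication note above concrete: for a 4:2:2 source (UVSHIFT = 1) with an odd remainder r, the wrappers copy SS(r, UVSHIFT) chroma samples and then duplicate one more, so a kernel that re-pairs chroma internally never reads an uninitialized sample. A standalone illustration of just that index math (the buffer and values are hypothetical):

#include <stdio.h>

#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

int main(void) {
  unsigned char u[8] = {0};
  int r = 5;                  // odd remainder of luma pixels
  int copied = SS(r, 1);      // 3 chroma samples cover 5 luma pixels
  for (int i = 0; i < copied; ++i) u[i] = (unsigned char)(40 + i);
  u[copied] = u[copied - 1];  // the replication step from ANY31C/ANY41C
  printf("copied=%d u[%d]=%d\n", copied, copied, u[copied]);  // copied=3 u[3]=42
  return 0;
}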
+// Any 3 planes to 1 with yuvconstants +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 3]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ + vin[256 + SS(r, UVSHIFT)] = vin[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(vin, vin + 128, vin + 256, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I422TOARGBROW_SSSE3 +ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TORGBAROW_SSSE3 +ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOARGB4444ROW_SSSE3 +ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGB1555ROW_SSSE3 +ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TORGB565ROW_SSSE3 +ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TORGB24ROW_SSSE3 +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) +#endif +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_SSSE3 +ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) +#endif +#ifdef HAS_I444TORGB24ROW_SSSE3 +ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_I422TORGB24ROW_AVX2 +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) +#endif +#ifdef HAS_I422TOARGBROW_AVX2 +ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TOARGBROW_AVX512BW +ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31) +#endif +#ifdef HAS_I422TORGBAROW_AVX2 +ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_AVX2 +ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_I444TORGB24ROW_AVX2 +ANY31C(I444ToRGB24Row_Any_AVX2, I444ToRGB24Row_AVX2, 0, 0, 3, 31) +#endif +#ifdef HAS_I422TOARGB4444ROW_AVX2 +ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TOARGB1555ROW_AVX2 +ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TORGB565ROW_AVX2 +ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I444TORGB24ROW_NEON +ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7) +#endif +#ifdef HAS_I422TOARGBROW_NEON +ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) +ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) 
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGBROW_MSA +ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) +ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGBROW_LSX +ANY31C(I422ToARGBRow_Any_LSX, I422ToARGBRow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGBARow_Any_LSX, I422ToRGBARow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGB24Row_Any_LSX, I422ToRGB24Row_LSX, 1, 0, 3, 15) +ANY31C(I422ToRGB565Row_Any_LSX, I422ToRGB565Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB4444Row_Any_LSX, I422ToARGB4444Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB1555Row_Any_LSX, I422ToARGB1555Row_LSX, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TOARGBROW_LASX +ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31) +ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31) +ANY31C(I422ToRGB24Row_Any_LASX, I422ToRGB24Row_LASX, 1, 0, 3, 31) +ANY31C(I422ToRGB565Row_Any_LASX, I422ToRGB565Row_LASX, 1, 0, 2, 31) +ANY31C(I422ToARGB4444Row_Any_LASX, I422ToARGB4444Row_LASX, 1, 0, 2, 31) +ANY31C(I422ToARGB1555Row_Any_LASX, I422ToARGB1555Row_LASX, 1, 0, 2, 31) +#endif +#ifdef HAS_I444TOARGBROW_LSX +ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15) +#endif +#undef ANY31C + +// Any 3 planes of 16 bit to 1 with yuvconstants +// TODO(fbarchard): consider sharing this code with ANY31C +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_SSSE3 +ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I410TOAR30ROW_SSSE3 +ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_SSSE3 +ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_AVX2 +ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I410TOAR30ROW_AVX2 +ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I212TOAR30ROW_SSSE3 
+ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_SSSE3 +ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_AVX2 +ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I212TOAR30ROW_AVX2 +ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#undef ANY31CT + +// Any 3 planes to 1 plane with parameter +#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE vin[16 * 3]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ + } \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_MERGEXR30ROW_AVX2 +ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +#endif + +#ifdef HAS_MERGEXR30ROW_NEON +ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) +ANY31PT(MergeXR30Row_10_Any_NEON, + MergeXR30Row_10_NEON, + uint16_t, + 2, + uint8_t, + 4, + 3) +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEXR64ROW_NEON +ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +ANY31PT(MergeXRGB16To8Row_Any_AVX2, + MergeXRGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_NEON +ANY31PT(MergeXRGB16To8Row_Any_NEON, + MergeXRGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY31PT + +// Any 2 planes to 1. +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(vin, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ + } + +// Merge functions. 
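Every ANY21 instantiation below encodes the kernel's block size as MASK (block size minus one), and the wrapper derives the SIMD width n and the staged remainder r from it. A quick worked example of that arithmetic (purely illustrative):

#include <stdio.h>

int main(void) {
  const int mask = 15;  // a kernel that processes 16 pixels per call
  const int widths[] = {1, 16, 37, 100};
  for (int i = 0; i < 4; ++i) {
    int w = widths[i];
    printf("width=%3d -> n=%3d SIMD pixels, r=%2d staged pixels\n", w,
           w & ~mask, w & mask);
  }
  return 0;
}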
+#ifdef HAS_MERGEUVROW_SSE2 +ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX512BW +ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_MSA +ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_LSX +ANY21(MergeUVRow_Any_LSX, MergeUVRow_LSX, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_NV21TOYUV24ROW_NEON +ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TOYUV24ROW_SSSE3 +ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TOYUV24ROW_AVX2 +ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) +#endif +// Math functions. +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_MSA +ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_LSX +ANY21(ARGBMultiplyRow_Any_LSX, ARGBMultiplyRow_LSX, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_LASX +ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_MSA +ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_LSX +ANY21(ARGBAddRow_Any_LSX, ARGBAddRow_LSX, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_LASX +ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_MSA +ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_LSX +ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_LASX +ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_SOBELROW_SSE2 +ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELROW_NEON +ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELROW_MSA +ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELROW_LSX +ANY21(SobelRow_Any_LSX, SobelRow_LSX, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_SSE2 +ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_NEON +ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_MSA +ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) +#endif +#ifdef HAS_SOBELTOPLANEROW_LSX 
+ANY21(SobelToPlaneRow_Any_LSX, SobelToPlaneRow_LSX, 0, 1, 1, 1, 31) +#endif +#ifdef HAS_SOBELXYROW_SSE2 +ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELXYROW_NEON +ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELXYROW_MSA +ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELXYROW_LSX +ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) +#endif +#undef ANY21 + +// Any 2 planes to 1 with stride +// width is measured in source pixels. 4 bytes contains 2 pixels +#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[32 * 2]); \ + SIMD_ALIGNED(uint8_t vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int awidth = (width + 1) / 2; \ + int r = awidth & MASK; \ + int n = awidth & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2); \ + } \ + memcpy(vin, src_yuy2 + n * SBPP, r * SBPP); \ + memcpy(vin + 32, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP); \ + ANY_SIMD(vin, 32, vout, MASK + 1); \ + memcpy(dst_uv + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_YUY2TONVUVROW_NEON +ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_SSE2 +ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_AVX2 +ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) +#endif + +// Any 2 planes to 1 with yuvconstants +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(vin, vin + 128, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ + } + +// Biplanar to RGB. 
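Note the width convention in ANY21S above: width is counted in source pixels and one 4-byte YUY2 group carries two of them, so the wrapper masks awidth = (width + 1) / 2 groups and hands the kernel n * 2. A small illustrative check of the rounding:

#include <stdio.h>

int main(void) {
  const int mask = 7;  // e.g. YUY2ToNVUVRow_Any_NEON above
  for (int width = 30; width <= 33; ++width) {
    int awidth = (width + 1) / 2;  // 2 pixels per 4-byte YUY2 group
    printf("width=%d -> awidth=%d -> n=%d r=%d\n", width, awidth,
           awidth & ~mask, awidth & mask);
  }
  return 0;
}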
+#ifdef HAS_NV12TOARGBROW_SSSE3 +ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_AVX2 +ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV12TOARGBROW_NEON +ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_MSA +ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_LSX +ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_LASX +ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV21TOARGBROW_SSSE3 +ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_AVX2 +ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV21TOARGBROW_NEON +ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_MSA +ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_LSX +ANY21C(NV21ToARGBRow_Any_LSX, NV21ToARGBRow_LSX, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_LASX +ANY21C(NV21ToARGBRow_Any_LASX, NV21ToARGBRow_LASX, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV12TORGB24ROW_NEON +ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV21TORGB24ROW_NEON +ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_SSSE3 +ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TORGB24ROW_SSSE3 +ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV12TORGB24ROW_AVX2 +ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV21TORGB24ROW_AVX2 +ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV12TORGB565ROW_SSSE3 +ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_AVX2 +ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +#endif +#ifdef HAS_NV12TORGB565ROW_NEON +ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_MSA +ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_LSX +ANY21C(NV12ToRGB565Row_Any_LSX, NV12ToRGB565Row_LSX, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_LASX +ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) +#endif +#undef ANY21C + +// Any 2 planes of 16 bit to 1 with yuvconstants +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(vin, vin + 16, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_P210TOAR30ROW_SSSE3 +ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_SSSE3 
+ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_AVX2
+ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P210TOAR30ROW_AVX2
+ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_SSSE3
+ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_SSSE3
+ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_AVX2
+ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_AVX2
+ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+
+#undef ANY21CT
+
+// Any 2 16 bit planes with parameter to 1
+#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+  void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \
+               int width) { \
+    SIMD_ALIGNED(T vin[16 * 2]); \
+    SIMD_ALIGNED(T vout[16]); \
+    memset(vin, 0, sizeof(vin)); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_u, src_v, dst_uv, depth, n); \
+    } \
+    memcpy(vin, src_u + n, r * BPP); \
+    memcpy(vin + 16, src_v + n, r * BPP); \
+    ANY_SIMD(vin, vin + 16, vout, depth, MASK + 1); \
+    memcpy(dst_uv + n * 2, vout, r * BPP * 2); \
+  }
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7)
+#endif
+#ifdef HAS_MERGEUVROW_16_NEON
+ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+#undef ANY21PT
+
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint8_t vin[128]); \
+    SIMD_ALIGNED(uint8_t vout[128]); \
+    memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, n); \
+    } \
+    memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(vin, vout, MASK + 1); \
+    memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+  }
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) +ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_SSE2) +ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_AVX2) +ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_RGB24TOARGBROW_SSSE3) +ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) +#endif +#if defined(HAS_RAWTORGBAROW_SSSE3) +ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) +#endif +#if defined(HAS_RAWTORGB24ROW_SSSE3) +ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) +ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) +ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) +ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) +#endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) +#endif +#if defined(HAS_ARGBTORGB24ROW_LSX) +ANY11(ARGBToRGB24Row_Any_LSX, ARGBToRGB24Row_LSX, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_LSX, ARGBToRAWRow_LSX, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_LSX, ARGBToRGB565Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_LSX, ARGBToARGB1555Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_LSX, ARGBToARGB4444Row_LSX, 0, 4, 2, 7) +#endif +#if defined(HAS_ARGBTORGB24ROW_LASX) +ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31) +ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31) +ANY11(ARGBToRGB565Row_Any_LASX, ARGBToRGB565Row_LASX, 0, 4, 2, 15) +ANY11(ARGBToARGB1555Row_Any_LASX, ARGBToARGB1555Row_LASX, 0, 4, 2, 15) +ANY11(ARGBToARGB4444Row_Any_LASX, ARGBToARGB4444Row_LASX, 0, 4, 2, 15) +#endif +#if defined(HAS_J400TOARGBROW_LSX) +ANY11(J400ToARGBRow_Any_LSX, J400ToARGBRow_LSX, 0, 1, 4, 15) +#endif +#if defined(HAS_RAWTORGB24ROW_NEON) +ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) +#endif +#if defined(HAS_RAWTORGB24ROW_MSA) +ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) +#endif +#if 
defined(HAS_RAWTORGB24ROW_LSX) +ANY11(RAWToRGB24Row_Any_LSX, RAWToRGB24Row_LSX, 0, 3, 3, 15) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 +ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ABGRTOYROW_AVX2 +ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_AVX2 +ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ABGRTOYJROW_AVX2 +ANY11(ABGRToYJRow_Any_AVX2, ABGRToYJRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_RGBATOYJROW_AVX2 +ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_AVX2 +ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) +#endif +#ifdef HAS_YUY2TOYROW_AVX2 +ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) +ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_SSE2 +ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) +ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYJROW_SSSE3 +ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_SSSE3 +ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_NEON +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_MSA +ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_LSX +ANY11(ARGBToYRow_Any_LSX, ARGBToYRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_LASX +ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_NEON +ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYJROW_NEON +ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_NEON +ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_MSA +ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_LSX +ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_LSX +ANY11(RGBAToYJRow_Any_LSX, RGBAToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYJROW_LSX +ANY11(ABGRToYJRow_Any_LSX, ABGRToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_LASX +ANY11(RGBAToYJRow_Any_LASX, RGBAToYJRow_LASX, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_LASX +ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31) +#endif +#ifdef HAS_ABGRTOYJROW_LASX +ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31) +#endif +#ifdef HAS_BGRATOYROW_NEON +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_MSA +ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_LSX +ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_LASX +ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31) +#endif +#ifdef HAS_ABGRTOYROW_NEON +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYROW_MSA +ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) +#endif +#ifdef HAS_ABGRTOYROW_LSX +ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYROW_LASX 
+ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31) +#endif +#ifdef HAS_RGBATOYROW_NEON +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYROW_MSA +ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYROW_LSX +ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYROW_LASX +ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) +#endif +#ifdef HAS_RGB24TOYROW_NEON +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_AVX2 +ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) +#endif +#ifdef HAS_RGB24TOYJROW_SSSE3 +ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_NEON +ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYROW_MSA +ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYROW_LSX +ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_LSX +ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_LASX +ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31) +#endif +#ifdef HAS_RGB24TOYROW_LASX +ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) +#endif +#ifdef HAS_RAWTOYROW_NEON +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_AVX2 +ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) +#endif +#ifdef HAS_RAWTOYJROW_SSSE3 +ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_NEON +ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYROW_MSA +ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYROW_LSX +ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYROW_LASX +ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) +#endif +#ifdef HAS_RAWTOYJROW_LSX +ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_LASX +ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31) +#endif +#ifdef HAS_RGB565TOYROW_NEON +ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_RGB565TOYROW_MSA +ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) +#endif +#ifdef HAS_RGB565TOYROW_LSX +ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15) +#endif +#ifdef HAS_RGB565TOYROW_LASX +ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON +ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_ARGB1555TOYROW_MSA +ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) +#endif +#ifdef HAS_ARGB1555TOYROW_LSX +ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15) +#endif +#ifdef HAS_ARGB1555TOYROW_LASX +ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON +ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_YUY2TOYROW_NEON +ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_UYVYTOYROW_NEON +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_YUY2TOYROW_LSX +ANY11(YUY2ToYRow_Any_LSX, YUY2ToYRow_LSX, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_LASX +ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA 
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_LSX +ANY11(UYVYToYRow_Any_LSX, UYVYToYRow_LSX, 1, 4, 1, 15) +#endif +#ifdef HAS_UYVYTOYROW_LASX +ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) +#endif +#ifdef HAS_AYUVTOYROW_NEON +ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_SWAPUVROW_SSSE3 +ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) +#endif +#ifdef HAS_SWAPUVROW_AVX2 +ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) +#endif +#ifdef HAS_SWAPUVROW_NEON +ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON +ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RGB24TOARGBROW_MSA +ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) +#endif +#ifdef HAS_RGB24TOARGBROW_LSX +ANY11(RGB24ToARGBRow_Any_LSX, RGB24ToARGBRow_LSX, 0, 3, 4, 15) +#endif +#ifdef HAS_RGB24TOARGBROW_LASX +ANY11(RGB24ToARGBRow_Any_LASX, RGB24ToARGBRow_LASX, 0, 3, 4, 31) +#endif +#ifdef HAS_RAWTOARGBROW_NEON +ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RAWTORGBAROW_NEON +ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RAWTOARGBROW_MSA +ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) +#endif +#ifdef HAS_RAWTOARGBROW_LSX +ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15) +#endif +#ifdef HAS_RAWTOARGBROW_LASX +ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON +ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_RGB565TOARGBROW_MSA +ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_RGB565TOARGBROW_LSX +ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15) +#endif +#ifdef HAS_RGB565TOARGBROW_LASX +ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB1555TOARGBROW_MSA +ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB1555TOARGBROW_LSX +ANY11(ARGB1555ToARGBRow_Any_LSX, ARGB1555ToARGBRow_LSX, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB1555TOARGBROW_LASX +ANY11(ARGB1555ToARGBRow_Any_LASX, ARGB1555ToARGBRow_LASX, 0, 2, 4, 31) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON +ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB4444TOARGBROW_MSA +ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB4444TOARGBROW_LSX +ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB4444TOARGBROW_LASX +ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_AVX2 +ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_MSA +ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) +#endif +#ifdef 
HAS_ARGBATTENUATEROW_LSX +ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_LASX +ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_NEON +ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MSA +ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_LSX +ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15) +#endif +#undef ANY11 + +// Any 1 to 1 blended. Destination is read, modify, write. +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vout, 0, sizeof(vout)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(vout, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) +#endif +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) +#endif +#undef ANY11B + +// Any 1 to 1 with parameter. 
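Before the parameterized variant, one detail of ANY11B above is easy to miss: vout is pre-filled from the destination before the kernel runs, because these rows read dst as well as write it. A standalone sketch of that read-modify-write tail, using a hypothetical scalar kernel that copies only the alpha byte of each ARGB pixel (no name here is from the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Hypothetical kernel: overwrite only byte 3 (alpha) of each ARGB pixel.
static void CopyAlpha_SIMD4(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i * 4 + 3] = src[i * 4 + 3];
}

static void CopyAlpha_Any(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t vin[16], vout[16];
  memset(vin, 0, sizeof(vin));
  memset(vout, 0, sizeof(vout));
  int r = width & 3;
  int n = width & ~3;
  if (n > 0) CopyAlpha_SIMD4(src, dst, n);
  memcpy(vin, src + n * 4, r * 4);
  memcpy(vout, dst + n * 4, r * 4);  // the read-modify-write step
  CopyAlpha_SIMD4(vin, vout, 4);
  memcpy(dst + n * 4, vout, r * 4);
}

int main(void) {
  uint8_t src[20], dst[20];
  memset(src, 0xAA, sizeof(src));
  memset(dst, 0x11, sizeof(dst));
  CopyAlpha_Any(src, dst, 5);               // n = 4, r = 1
  printf("%02x %02x\n", dst[16], dst[19]);  // prints: 11 aa
  return 0;
}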
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+    SIMD_ALIGNED(uint8_t vin[64]); \
+    SIMD_ALIGNED(uint8_t vout[64]); \
+    memset(vin, 0, sizeof(vin)); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, param, n); \
+    } \
+    memcpy(vin, src_ptr + n * SBPP, r * SBPP); \
+    ANY_SIMD(vin, vout, param, MASK + 1); \
+    memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+  }
+
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+       I400ToARGBRow_SSE2,
+       const struct YuvConstants*,
+       1,
+       4,
+       7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+       I400ToARGBRow_AVX2,
+       const struct YuvConstants*,
+       1,
+       4,
+       15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+       I400ToARGBRow_NEON,
+       const struct YuvConstants*,
+       1,
+       4,
+       7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+       I400ToARGBRow_MSA,
+       const struct YuvConstants*,
+       1,
+       4,
+       15)
+#endif
+#if defined(HAS_I400TOARGBROW_LSX)
+ANY11P(I400ToARGBRow_Any_LSX,
+       I400ToARGBRow_LSX,
+       const struct YuvConstants*,
+       1,
+       4,
+       15)
+#endif
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+       ARGBToRGB565DitherRow_SSE2,
+       const uint32_t,
+       4,
+       2,
+       3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+       ARGBToRGB565DitherRow_AVX2,
+       const uint32_t,
+       4,
+       2,
+       7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+       ARGBToRGB565DitherRow_NEON,
+       const uint32_t,
+       4,
+       2,
+       7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+       ARGBToRGB565DitherRow_MSA,
+       const uint32_t,
+       4,
+       2,
+       7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LSX)
+ANY11P(ARGBToRGB565DitherRow_Any_LSX,
+       ARGBToRGB565DitherRow_LSX,
+       const uint32_t,
+       4,
+       2,
+       7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ANY11P(ARGBToRGB565DitherRow_Any_LASX,
+       ARGBToRGB565DitherRow_LASX,
+       const uint32_t,
+       4,
+       2,
+       15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_LSX
+ANY11P(ARGBShuffleRow_Any_LSX, ARGBShuffleRow_LSX, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_LASX
+ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 with type
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+  void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint8_t vin[(MASK + 1) * SBPP]); \
+    SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]); \
+    memset(vin, 0, sizeof(vin)); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, n); \
+    } \
+    memcpy(vin, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \
+    ANY_SIMD((STYPE*)vin, (DTYPE*)vout, MASK + 1); \
+    memcpy((uint8_t*)(dst_ptr) + n * BPP, vout, r * BPP); \
+  }
+
+#ifdef HAS_ARGBTOAR64ROW_SSSE3
+ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_ARGBTOAB64ROW_SSSE3 +ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_AR64TOARGBROW_SSSE3 +ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_NEON +ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_NEON +ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#undef ANY11T + +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. +#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ + SIMD_ALIGNED(STYPE vin[32]); \ + SIMD_ALIGNED(DTYPE vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, n); \ + } \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, scale, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ + } + +#ifdef HAS_CONVERT16TO8ROW_SSSE3 +ANY11C(Convert16To8Row_Any_SSSE3, + Convert16To8Row_SSSE3, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT16TO8ROW_AVX2 +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) +#endif +#ifdef HAS_CONVERT16TO8ROW_NEON +ANY11C(Convert16To8Row_Any_NEON, + Convert16To8Row_NEON, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_SSE2 +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_AVX2 +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) +#endif +#ifdef HAS_MULTIPLYROW_16_AVX2 +ANY11C(MultiplyRow_16_Any_AVX2, + MultiplyRow_16_AVX2, + 2, + 2, + uint16_t, + uint16_t, + 31) +#endif +#ifdef HAS_MULTIPLYROW_16_NEON +ANY11C(MultiplyRow_16_Any_NEON, + MultiplyRow_16_NEON, + 2, + 2, + uint16_t, + uint16_t, + 15) +#endif +#ifdef HAS_DIVIDEROW_16_AVX2 +ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31) +#endif +#ifdef HAS_DIVIDEROW_16_NEON +ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) +#endif +#undef ANY11C + +// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. 
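For reading the scale parameter in the ANY11C instantiations above: in the Convert16To8-style kernels it acts as a Q16 multiplier, so 16384 maps 10-bit samples onto 8 bits. A hedged scalar sketch of that convention (simplified from the C reference rows; the clamping is assumed):

#include <stdint.h>
#include <stdio.h>

static uint8_t Convert16To8(uint16_t v, int scale) {
  uint32_t x = ((uint32_t)v * (uint32_t)scale) >> 16;  // Q16 multiply
  return (uint8_t)(x > 255 ? 255 : x);                 // clamp to a byte
}

int main(void) {
  printf("%d\n", Convert16To8(1023, 16384));  // 255: 10 bit full scale
  printf("%d\n", Convert16To8(512, 16384));   // 128: 10 bit mid grey
  return 0;
}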
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ + void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ + SIMD_ALIGNED(ST vin[32]); \ + SIMD_ALIGNED(T vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, param, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ + } + +#ifdef HAS_HALFFLOATROW_SSE2 +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) +#endif +#ifdef HAS_HALFFLOATROW_AVX2 +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) +#endif +#ifdef HAS_HALFFLOATROW_F16C +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) +ANY11P16(HalfFloat1Row_Any_F16C, + HalfFloat1Row_F16C, + uint16_t, + uint16_t, + 2, + 2, + 15) +#endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) +ANY11P16(HalfFloat1Row_Any_NEON, + HalfFloat1Row_NEON, + uint16_t, + uint16_t, + 2, + 2, + 7) +#endif +#ifdef HAS_HALFFLOATROW_MSA +ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) +#endif +#ifdef HAS_BYTETOFLOATROW_NEON +ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) +#endif +#ifdef HAS_HALFFLOATROW_LSX +ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31) +#endif +#undef ANY11P16 + +// Any 1 to 1 with yuvconstants +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ + } + +#if defined(HAS_YUY2TOARGBROW_SSSE3) +ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) +ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) +ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) +ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) +ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) +#endif +#if defined(HAS_YUY2TOARGBROW_LSX) +ANY11C(YUY2ToARGBRow_Any_LSX, YUY2ToARGBRow_LSX, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7) +#endif +#undef ANY11C + +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. 
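The interpolate wrappers defined next blend two source rows by source_y_fraction in 1/256 steps, which is why the macro copies the second staged row only when the fraction is nonzero. A hedged scalar sketch of the per-byte blend (the rounding term is assumed from the C reference row):

#include <stdint.h>
#include <stdio.h>

static uint8_t Interp(uint8_t row0, uint8_t row1, int fraction) {
  // fraction 0..256 selects between row0 and row1, rounded to nearest.
  return (uint8_t)((row0 * (256 - fraction) + row1 * fraction + 128) >> 8);
}

int main(void) {
  printf("%d\n", Interp(100, 200, 0));    // 100: fraction 0 is pure row0
  printf("%d\n", Interp(100, 200, 128));  // 150: halfway between rows
  return 0;
}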
+#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ + void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ + } \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + if (source_y_fraction) { \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ + r * SBPP * sizeof(TS)); \ + } \ + ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 +ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 +ANY11I(InterpolateRow_Any_SSSE3, + InterpolateRow_SSSE3, + uint8_t, + uint8_t, + 1, + 1, + 15) +#endif +#ifdef HAS_INTERPOLATEROW_NEON +ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_MSA +ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31) +#endif +#ifdef HAS_INTERPOLATEROW_LSX +ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31) +#endif + +#ifdef HAS_INTERPOLATEROW_16_NEON +ANY11I(InterpolateRow_16_Any_NEON, + InterpolateRow_16_NEON, + uint16_t, + uint16_t, + 1, + 1, + 7) +#endif +#undef ANY11I + +// Any 1 to 1 interpolate with scale param +#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ + void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ + int scale, int width, int source_y_fraction) { \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \ + } \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + if (source_y_fraction) { \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ + r * SBPP * sizeof(TS)); \ + } \ + ANY_SIMD(vout, vin, 64, scale, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ + } + +#ifdef HAS_INTERPOLATEROW_16TO8_NEON +ANY11IS(InterpolateRow_16To8_Any_NEON, + InterpolateRow_16To8_NEON, + uint8_t, + uint16_t, + 1, + 1, + 7) +#endif +#ifdef HAS_INTERPOLATEROW_16TO8_AVX2 +ANY11IS(InterpolateRow_16To8_Any_AVX2, + InterpolateRow_16To8_AVX2, + uint8_t, + uint16_t, + 1, + 1, + 31) +#endif + +#undef ANY11IS + +// Any 1 to 1 mirror. 
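The mirror wrappers below invert the usual tail handling: the kernel is pointed at src_ptr + r * BPP so the remainder is the front of the source row, and the valid mirrored pixels come out at the end of vout. A standalone sketch of that trick with an illustrative scalar kernel (names not from the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void MirrorRow_SIMD8(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i] = src[width - 1 - i];  // stand-in
}

static void MirrorRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t vin[8], vout[8];
  memset(vin, 0, sizeof(vin));
  int r = width & 7;
  int n = width & ~7;
  if (n > 0) MirrorRow_SIMD8(src + r, dst, n);  // skip the tail at the front
  memcpy(vin, src, r);                 // tail = first r source bytes
  MirrorRow_SIMD8(vin, vout, 8);
  memcpy(dst + n, vout + (8 - r), r);  // valid pixels end the block
}

int main(void) {
  uint8_t src[11], dst[11];
  for (int i = 0; i < 11; ++i) src[i] = (uint8_t)i;
  MirrorRow_Any(src, dst, 11);
  printf("%d %d\n", dst[0], dst[10]);  // prints: 10 0
  return 0;
}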
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr, r* BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ + } + +#ifdef HAS_MIRRORROW_AVX2 +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) +#endif +#ifdef HAS_MIRRORROW_SSSE3 +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) +#endif +#ifdef HAS_MIRRORROW_NEON +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) +#endif +#ifdef HAS_MIRRORROW_MSA +ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) +#endif +#ifdef HAS_MIRRORROW_LSX +ANY11M(MirrorRow_Any_LSX, MirrorRow_LSX, 1, 31) +#endif +#ifdef HAS_MIRRORROW_LASX +ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) +#endif +#ifdef HAS_MIRRORUVROW_AVX2 +ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) +#endif +#ifdef HAS_MIRRORUVROW_SSSE3 +ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) +#endif +#ifdef HAS_MIRRORUVROW_NEON +ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) +#endif +#ifdef HAS_MIRRORUVROW_MSA +ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) +#endif +#ifdef HAS_MIRRORUVROW_LSX +ANY11M(MirrorUVRow_Any_LSX, MirrorUVRow_LSX, 2, 7) +#endif +#ifdef HAS_MIRRORUVROW_LASX +ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_NEON +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_MSA +ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) +#endif +#ifdef HAS_ARGBMIRRORROW_LSX +ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_LASX +ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) +#endif +#ifdef HAS_RGB24MIRRORROW_SSSE3 +ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) +#endif +#ifdef HAS_RGB24MIRRORROW_NEON +ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) +#endif +#undef ANY11M + +// Any 1 plane. (memset) +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8_t vout[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(vout, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_SETROW_X86 +ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) +#endif +#ifdef HAS_SETROW_NEON +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) +#endif +#ifdef HAS_SETROW_LSX +ANY1(SetRow_Any_LSX, SetRow_LSX, uint8_t, 1, 15) +#endif +#ifdef HAS_ARGBSETROW_NEON +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) +#endif +#ifdef HAS_ARGBSETROW_MSA +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) +#endif +#ifdef HAS_ARGBSETROW_LSX +ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) +#endif +#undef ANY1 + +// Any 1 to 2. Outputs UV planes. 
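The ANY12 shape defined next stages one interleaved row and writes the two planar tails from vout and vout + 128. A reduced standalone sketch of the same flow (hypothetical names, 16-pixel blocks):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Stand-in for a SIMD kernel that de-interleaves UV into U and V.
static void SplitUV_SIMD16(const uint8_t* uv, uint8_t* u, uint8_t* v, int w) {
  for (int i = 0; i < w; ++i) {
    u[i] = uv[2 * i];
    v[i] = uv[2 * i + 1];
  }
}

static void SplitUV_Any(const uint8_t* uv, uint8_t* u, uint8_t* v, int w) {
  uint8_t vin[32], vout[32];
  memset(vin, 0, sizeof(vin));
  int r = w & 15;
  int n = w & ~15;
  if (n > 0) SplitUV_SIMD16(uv, u, v, n);
  memcpy(vin, uv + n * 2, r * 2);            // stage the interleaved tail
  SplitUV_SIMD16(vin, vout, vout + 16, 16);  // planar halves of vout
  memcpy(u + n, vout, r);
  memcpy(v + n, vout + 16, r);
}

int main(void) {
  uint8_t uv[42], u[21], v[21];
  for (int i = 0; i < 42; ++i) uv[i] = (uint8_t)i;
  SplitUV_Any(uv, u, v, 21);                // n = 16, r = 5
  printf("%d %d %d\n", u[0], v[0], u[20]);  // prints: 0 1 40
  return 0;
}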
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
+               int width) { \
+    SIMD_ALIGNED(uint8_t vin[128]); \
+    SIMD_ALIGNED(uint8_t vout[128 * 2]); \
+    memset(vin, 0, sizeof(vin)); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+    } \
+    memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+    ANY_SIMD(vin, vout, vout + 128, MASK + 1); \
+    memcpy(dst_u + (n >> DUVSHIFT), vout, SS(r, DUVSHIFT)); \
+    memcpy(dst_v + (n >> DUVSHIFT), vout + 128, SS(r, DUVSHIFT)); \
+  }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_LSX
+ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_LSX
+ANY12(ARGBToUV444Row_Any_LSX, ARGBToUV444Row_LSX, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_LSX, UYVYToUV422Row_LSX, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_LASX
+ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31)
+ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31)
+#endif
+#undef ANY12
+
+// Any 1 to 2 planes of 16 bit, with a depth parameter.
+#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+  void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \
+    SIMD_ALIGNED(T vin[16 * 2]); \
+    SIMD_ALIGNED(T vout[16 * 2]); \
+    memset(vin, 0, sizeof(vin)); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \
+    } \
+    memcpy(vin, src_uv + n * 2, r * BPP * 2); \
+    ANY_SIMD(vin, vout, vout + 16, depth, MASK + 1); \
+    memcpy(dst_u + n, vout, r * BPP); \
+    memcpy(dst_v + n, vout + 16, r * BPP); \
+  }
+
+#ifdef HAS_SPLITUVROW_16_AVX2
+ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15)
+#endif
+
+#ifdef HAS_SPLITUVROW_16_NEON
+ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+#undef ANY12PT
+
+// Any 1 to 3. Outputs RGB planes.
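Similarly, a scalar sketch of the 1-to-3 shape that ANY13 wraps: one packed RGB24 row split into three planes. SplitRGBRow_Scalar is a hypothetical name, and the channel byte order shown is an assumption for illustration.

    #include <stdio.h>
    #include <stdint.h>

    static void SplitRGBRow_Scalar(const uint8_t* src_rgb, uint8_t* dst_r,
                                   uint8_t* dst_g, uint8_t* dst_b, int width) {
      for (int x = 0; x < width; ++x) {
        dst_r[x] = src_rgb[3 * x + 0];
        dst_g[x] = src_rgb[3 * x + 1];
        dst_b[x] = src_rgb[3 * x + 2];
      }
    }

    int main(void) {
      const uint8_t rgb[6] = {10, 20, 30, 40, 50, 60};  /* 2 packed pixels */
      uint8_t r[2], g[2], b[2];
      SplitRGBRow_Scalar(rgb, r, g, b, 2);
      printf("R: %d %d  G: %d %d  B: %d %d\n",
             r[0], r[1], g[0], g[1], b[0], b[1]);
      /* R: 10 40  G: 20 50  B: 30 60 */
      return 0;
    }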
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[16 * 3]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ + } + +#ifdef HAS_SPLITRGBROW_SSSE3 +ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_NEON +ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) +#endif +#ifdef HAS_SPLITXRGBROW_SSE2 +ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) +#endif +#ifdef HAS_SPLITXRGBROW_SSSE3 +ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) +#endif +#ifdef HAS_SPLITXRGBROW_AVX2 +ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) +#endif +#ifdef HAS_SPLITXRGBROW_NEON +ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) +#endif + +// Any 1 to 4. Outputs ARGB planes. +#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, uint8_t* dst_a, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[16 * 4]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, vout + 48, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ + memcpy(dst_a + n, vout + 48, r); \ + } + +#ifdef HAS_SPLITARGBROW_SSE2 +ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7) +#endif +#ifdef HAS_SPLITARGBROW_SSSE3 +ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7) +#endif +#ifdef HAS_SPLITARGBROW_AVX2 +ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15) +#endif +#ifdef HAS_SPLITARGBROW_NEON +ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) +#endif + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. 
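The kernels ANY12S wraps consume two source rows and emit one 2x2-subsampled U and V sample per pair of columns. A scalar sketch, assuming a plain rounded average and the BT.601 U/V coefficients that appear later in this patch in row_common.cc; the real kernels use pavgb chains or sum/2 with halved coefficients instead. ARGBToUVRow_Scalar is a hypothetical name and the width is assumed even here for brevity.

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t RGBToU_(int r, int g, int b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static uint8_t RGBToV_(int r, int g, int b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }

    static void ARGBToUVRow_Scalar(const uint8_t* argb, int stride,
                                   uint8_t* dst_u, uint8_t* dst_v, int width) {
      const uint8_t* row1 = argb + stride;
      for (int x = 0; x < width; x += 2) {
        /* Average each 2x2 block of BGRA pixels, rounding. */
        int b = (argb[0] + argb[4] + row1[0] + row1[4] + 2) >> 2;
        int g = (argb[1] + argb[5] + row1[1] + row1[5] + 2) >> 2;
        int r = (argb[2] + argb[6] + row1[2] + row1[6] + 2) >> 2;
        *dst_u++ = RGBToU_(r, g, b);
        *dst_v++ = RGBToV_(r, g, b);
        argb += 8;
        row1 += 8;
      }
    }

    int main(void) {
      uint8_t argb[2 * 8] = {0};  /* 2 rows x 2 ARGB pixels, all black */
      uint8_t u, v;
      ARGBToUVRow_Scalar(argb, 8, &u, &v, 2);
      printf("U=%d V=%d\n", u, v);  /* both 128 for black */
      return 0;
    }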
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ABGRTOUVROW_AVX2 +ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_AVX2 +ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ABGRTOUVJROW_AVX2 +ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_SSSE3 +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVJROW_SSSE3 +ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) +ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_AVX2 +ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) +ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) +ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_NEON +ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_MSA +ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVROW_LSX +ANY12S(ARGBToUVRow_Any_LSX, ARGBToUVRow_LSX, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_LASX +ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON +ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVJROW_NEON +ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_MSA +ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_LSX +ANY12S(ARGBToUVJRow_Any_LSX, ARGBToUVJRow_LSX, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_LASX +ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31) +#endif +#ifdef HAS_BGRATOUVROW_NEON +ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_MSA +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_LSX +ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_NEON +ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_MSA +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_LSX +ANY12S(ABGRToUVRow_Any_LSX, 
ABGRToUVRow_LSX, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_NEON +ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_MSA +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_LSX +ANY12S(RGBAToUVRow_Any_LSX, RGBAToUVRow_LSX, 0, 4, 15) +#endif +#ifdef HAS_RGB24TOUVROW_NEON +ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RGB24TOUVJROW_NEON +ANY12S(RGB24ToUVJRow_Any_NEON, RGB24ToUVJRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RGB24TOUVROW_MSA +ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) +#endif +#ifdef HAS_RGB24TOUVROW_LSX +ANY12S(RGB24ToUVRow_Any_LSX, RGB24ToUVRow_LSX, 0, 3, 15) +#endif +#ifdef HAS_RGB24TOUVROW_LASX +ANY12S(RGB24ToUVRow_Any_LASX, RGB24ToUVRow_LASX, 0, 3, 31) +#endif +#ifdef HAS_RAWTOUVROW_NEON +ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVJROW_NEON +ANY12S(RAWToUVJRow_Any_NEON, RAWToUVJRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_MSA +ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_LSX +ANY12S(RAWToUVRow_Any_LSX, RAWToUVRow_LSX, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_LASX +ANY12S(RAWToUVRow_Any_LASX, RAWToUVRow_LASX, 0, 3, 31) +#endif +#ifdef HAS_RGB565TOUVROW_NEON +ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_RGB565TOUVROW_MSA +ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) +#endif +#ifdef HAS_RGB565TOUVROW_LSX +ANY12S(RGB565ToUVRow_Any_LSX, RGB565ToUVRow_LSX, 0, 2, 15) +#endif +#ifdef HAS_RGB565TOUVROW_LASX +ANY12S(RGB565ToUVRow_Any_LASX, RGB565ToUVRow_LASX, 0, 2, 31) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON +ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_MSA +ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_LSX +ANY12S(ARGB1555ToUVRow_Any_LSX, ARGB1555ToUVRow_LSX, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_LASX +ANY12S(ARGB1555ToUVRow_Any_LASX, ARGB1555ToUVRow_LASX, 0, 2, 31) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON +ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON +ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON +ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_YUY2TOUVROW_LSX +ANY12S(YUY2ToUVRow_Any_LSX, YUY2ToUVRow_LSX, 1, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_LASX +ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_LSX +ANY12S(UYVYToUVRow_Any_LSX, UYVYToUVRow_LSX, 1, 4, 15) +#endif +#ifdef HAS_UYVYTOUVROW_LASX +ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) +#endif +#undef ANY12S + +// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. +// 128 byte row allows for 32 avx ARGB pixels. 
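ANY11S differs from ANY12S only in writing a single interleaved VU plane, so its tail copies two bytes per subsampled pair. The SS() helper used by these macros is a round-up shift; upstream row_any.cc defines it earlier in this file as ((width) + (1 << (shift)) - 1) >> (shift). A quick standalone check of the tail sizes for an odd width:

    #include <assert.h>

    #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

    int main(void) {
      /* width = 17, MASK = 15: body n = 16, remainder r = 1. */
      int r = 17 & 15;
      assert(SS(r, 1) == 1);      /* the odd pixel still yields one UV pair */
      assert(SS(r, 1) * 2 == 2);  /* two bytes copied into the VU row */
      return 0;
    }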
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(vin, 128, vout, MASK + 1); \ + memcpy(dst_vu + (n >> 1) * 2, vout, SS(r, 1) * 2); \ + } + +#ifdef HAS_AYUVTOVUROW_NEON +ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15) +ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) +#endif +#undef ANY11S + +#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \ + SIMD_ALIGNED(T vin[16]); \ + SIMD_ALIGNED(T vout[16]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src, src_tile_stride, dst, n); \ + } \ + memcpy(vin, src + (n / 16) * src_tile_stride, r * BPP); \ + ANY_SIMD(vin, src_tile_stride, vout, MASK + 1); \ + memcpy(dst + n, vout, r * BPP); \ + } + +#ifdef HAS_DETILEROW_NEON +ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15) +#endif +#ifdef HAS_DETILEROW_SSE2 +ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, uint8_t, 1, 15) +#endif +#ifdef HAS_DETILEROW_16_NEON +ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_SSE2 +ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_AVX +ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) +#endif + +// DetileSplitUVRow width is in bytes +#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t vin[16]); \ + SIMD_ALIGNED(uint8_t vout[8 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \ + } \ + memcpy(vin, src_uv + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(vin, src_tile_stride, vout, vout + 8, r); \ + memcpy(dst_u + n / 2, vout, (r + 1) / 2); \ + memcpy(dst_v + n / 2, vout + 8, (r + 1) / 2); \ + } + +#ifdef HAS_DETILESPLITUVROW_NEON +ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) +#endif +#ifdef HAS_DETILESPLITUVROW_SSSE3 +ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) +#endif + +#define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \ + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, \ + uint8_t* dst_yuy2, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[16 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, \ + n); \ + } \ + memcpy(vin, src_y 
+ (n / 16) * src_y_tile_stride, r); \
+    memcpy(vin + 16, src_uv + (n / 16) * src_uv_tile_stride, r); \
+    ANY_SIMD(vin, src_y_tile_stride, vin + 16, src_uv_tile_stride, vout, r); \
+    memcpy(dst_yuy2 + 2 * n, vout, 2 * r); \
+  }
+
+#ifdef HAS_DETILETOYUY2_NEON
+ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15)
+#endif
+
+#ifdef HAS_DETILETOYUY2_SSE2
+ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15)
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/row_common.cc b/source/row_common.cc
new file mode 100644
index 00000000..8be37fb5
--- /dev/null
+++ b/source/row_common.cc
@@ -0,0 +1,4547 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <assert.h>
+#include <string.h>  // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h"  // For kYuvI601Constants
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+#define STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#define STATIC_CAST(type, expr) (type)(expr)
+#endif
+
+// This macro controls YUV to RGB using unsigned math to extend range of
+// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
+// LIBYUV_UNLIMITED_DATA
+
+// Macros to enable unlimited data for each colorspace
+// LIBYUV_UNLIMITED_BT601
+// LIBYUV_UNLIMITED_BT709
+// LIBYUV_UNLIMITED_BT2020
+
+// The following macro from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point for ARGBToI420:
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+    defined(_MSC_VER) && !defined(__clang__) && \
+    (defined(_M_IX86) || defined(_M_X64))
+#define LIBYUV_RGB7 1
+#endif
+
+#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
+                                   defined(__i386__) || defined(_M_IX86))
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#define LIBYUV_ATTENUATE_DUP 1
+#endif
+#if defined(LIBYUV_BIT_EXACT)
+#define LIBYUV_UNATTENUATE_DUP 1
+#endif
+
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32_t clamp0(int32_t v) {
+  return -(v >= 0) & v;
+}
+// TODO(fbarchard): make clamp255 preserve negative values.
+static __inline int32_t clamp255(int32_t v) {
+  return (-(v >= 255) | v) & 255;
+}
+
+static __inline int32_t clamp1023(int32_t v) {
+  return (-(v >= 1023) | v) & 1023;
+}
+
+// clamp to max
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+  return (-(v >= max) | v) & max;
+}
+
+static __inline uint32_t Abs(int32_t v) {
+  int m = -(v < 0);
+  return (v + m) ^ m;
+}
+#else  // USE_BRANCHLESS
+static __inline int32_t clamp0(int32_t v) {
+  return (v < 0) ? 0 : v;
+}
+
+static __inline int32_t clamp255(int32_t v) {
+  return (v > 255) ? 255 : v;
+}
+
+static __inline int32_t clamp1023(int32_t v) {
+  return (v > 1023) ? 1023 : v;
+}
+
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+  return (v > max) ? max : v;
+}
+
+static __inline uint32_t Abs(int32_t v) {
+  return (v < 0) ? 
-v : v; +} +#endif // USE_BRANCHLESS +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp255(v)); +} + +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp1023(v)); +} + +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v +#else +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); +} +#endif + +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb24 += 3; + } +} + +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_rgba[0] = 255u; + dst_rgba[1] = b; + dst_rgba[2] = g; + dst_rgba[3] = r; + dst_rgba += 4; + src_raw += 3; + } +} + +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + dst_rgb24 += 3; + src_raw += 3; + } +} + +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb565 += 2; + } +} + +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t a = STATIC_CAST(uint8_t, src_argb1555[1] >> 7); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_argb[3] = -a; + dst_argb += 4; + src_argb1555 += 2; + } +} + +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = STATIC_CAST(uint8_t, src_argb4444[0] & 0x0f); + uint8_t g = STATIC_CAST(uint8_t, src_argb4444[0] >> 4); + uint8_t r = STATIC_CAST(uint8_t, src_argb4444[1] & 0x0f); + 
uint8_t a = STATIC_CAST(uint8_t, src_argb4444[1] >> 4);
+    dst_argb[0] = STATIC_CAST(uint8_t, (b << 4) | b);
+    dst_argb[1] = STATIC_CAST(uint8_t, (g << 4) | g);
+    dst_argb[2] = STATIC_CAST(uint8_t, (r << 4) | r);
+    dst_argb[3] = STATIC_CAST(uint8_t, (a << 4) | a);
+    dst_argb += 4;
+    src_argb4444 += 2;
+  }
+}
+
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30;
+    memcpy(&ar30, src_ar30, sizeof ar30);
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
+    dst_argb += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30;
+    memcpy(&ar30, src_ar30, sizeof ar30);
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
+    dst_abgr += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30;
+    memcpy(&ar30, src_ar30, sizeof ar30);
+    uint32_t b = ar30 & 0x3ff;
+    uint32_t ga = ar30 & 0xc00ffc00;
+    uint32_t r = (ar30 >> 20) & 0x3ff;
+    *(uint32_t*)(dst_ab30) = r | ga | (b << 20);
+    dst_ab30 += 4;
+    src_ar30 += 4;
+  }
+}
+
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
+    dst_rgb[0] = b;
+    dst_rgb[1] = g;
+    dst_rgb[2] = r;
+    dst_rgb += 3;
+    src_argb += 4;
+  }
+}
+
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
+    dst_rgb[0] = r;
+    dst_rgb[1] = g;
+    dst_rgb[2] = b;
+    dst_rgb += 3;
+    src_argb += 4;
+  }
+}
+
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t b1 = src_argb[4] >> 3;
+    uint8_t g1 = src_argb[5] >> 2;
+    uint8_t r1 = src_argb[6] >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
+  }
+}
+
+// dither4 is a row of 4 values from a 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The matrix is ordered so that its first byte is the upper left entry.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect the order of the original matrix. But dither4
+// will contain the first pixel in the lower byte for little endian,
+// or in the upper byte for big endian.
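A worked example of that ordered dither for a single pixel, as a standalone program. The dither row value is illustrative, and clamp255_ is a local helper, not this file's clamp255; as the comment above notes, which byte of dither4 is picked depends on endianness.

    #include <stdio.h>
    #include <stdint.h>

    static int clamp255_(int v) { return v > 255 ? 255 : v; }

    int main(void) {
      uint32_t dither4 = 0x06020400;           /* a dither row: 0, 4, 2, 6 (LE) */
      uint8_t argb[4] = {100, 101, 102, 255};  /* one BGRA pixel */
      int x = 1;                               /* second pixel in the row */
      int d = ((const uint8_t*)&dither4)[x & 3];  /* 4 on little endian */
      uint8_t b = (uint8_t)(clamp255_(argb[0] + d) >> 3);  /* 104 >> 3 = 13 */
      uint8_t g = (uint8_t)(clamp255_(argb[1] + d) >> 2);  /* 105 >> 2 = 26 */
      uint8_t r = (uint8_t)(clamp255_(argb[2] + d) >> 3);  /* 106 >> 3 = 13 */
      uint16_t rgb565 = (uint16_t)(b | (g << 5) | (r << 11));
      printf("0x%04x\n", rgb565);  /* 0x6b4d on little endian */
      return 0;
    }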
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + int dither0 = ((const unsigned char*)(&dither4))[x & 3]; + int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, clamp255(src_argb[4] + dither1) >> 3); + uint8_t g1 = STATIC_CAST(uint8_t, clamp255(src_argb[5] + dither1) >> 2); + uint8_t r1 = STATIC_CAST(uint8_t, clamp255(src_argb[6] + dither1) >> 3); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + } +} + +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 3; + uint8_t r1 = src_argb[6] >> 3; + uint8_t a1 = src_argb[7] >> 7; + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | (a1 << 15)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + *(uint16_t*)(dst_rgb) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); + } +} + +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + uint8_t b1 = src_argb[4] >> 4; + uint8_t g1 = src_argb[5] >> 4; + uint8_t r1 = src_argb[6] >> 4; + uint8_t a1 = src_argb[7] >> 4; + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | (a1 << 12)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + *(uint16_t*)(dst_rgb) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); + } +} + +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t r0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t b0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); + 
dst_ar30 += 4; + src_abgr += 4; + } +} + +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); + uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); + uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); + uint32_t a0 = (src_argb[3] >> 6); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); + dst_ar30 += 4; + src_argb += 4; + } +} + +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + int x; + for (x = 0; x < width; ++x) { + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ar64[0] = b; + dst_ar64[1] = g; + dst_ar64[2] = r; + dst_ar64[3] = a; + dst_ar64 += 4; + src_argb += 4; + } +} + +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + int x; + for (x = 0; x < width; ++x) { + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ab64[0] = r; + dst_ab64[1] = g; + dst_ab64[2] = b; + dst_ab64[3] = a; + dst_ab64 += 4; + src_argb += 4; + } +} + +void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_ar64[0] >> 8; + uint8_t g = src_ar64[1] >> 8; + uint8_t r = src_ar64[2] >> 8; + uint8_t a = src_ar64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_ar64 += 4; + } +} + +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_ab64[0] >> 8; + uint8_t g = src_ab64[1] >> 8; + uint8_t b = src_ab64[2] >> 8; + uint8_t a = src_ab64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_ab64 += 4; + } +} + +// TODO(fbarchard): Make shuffle compatible with SIMD versions +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width) { + const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; + uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64; + int index0 = shuffler[0] / 2; + int index1 = shuffler[2] / 2; + int index2 = shuffler[4] / 2; + int index3 = shuffler[6] / 2; + // Shuffle a row of AR64. + int x; + for (x = 0; x < width / 2; ++x) { + // To support in-place conversion. + uint16_t b = src_ar64_16[index0]; + uint16_t g = src_ar64_16[index1]; + uint16_t r = src_ar64_16[index2]; + uint16_t a = src_ar64_16[index3]; + dst_ar64_16[0] = b; + dst_ar64_16[1] = g; + dst_ar64_16[2] = r; + dst_ar64_16[3] = a; + src_ar64_16 += 4; + dst_ar64_16 += 4; + } +} + +#ifdef LIBYUV_RGB7 +// Old 7 bit math for compatibility on unsupported platforms. +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16); +} +#else +// 8 bit +// Intel SSE/AVX uses the following equivalent formula +// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. 
+// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + +// 0x7e80) >> 8; + +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); +} +#endif + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. +#ifdef LIBYUV_RGBTOU_TRUNCATE +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); +} +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); +} +#else +// TODO(fbarchard): Add rounding to x86 SIMD and use this +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); +} +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); +} +#endif + +// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. +#if !defined(LIBYUV_ARGBTOUV_PAVGB) +static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST( + uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); +} +static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST( + uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); +} +#endif + +// ARGBToY_C and ARGBToUV_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G 
+ BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ + } +#endif + +MAKEROWY(ARGB, 2, 1, 0, 4) +MAKEROWY(BGRA, 1, 2, 3, 4) +MAKEROWY(ABGR, 0, 1, 2, 4) +MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 7 bit Y (deprecated) +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit Y: +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +#ifdef LIBYUV_RGB7 +// Old 7 bit math for compatibility on unsupported platforms. +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; +} +#else +// 8 bit +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (77 * r + 150 * g + 29 * b + 128) >> 8; +} +#endif + +#if defined(LIBYUV_ARGBTOUV_PAVGB) +static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} +#else +static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; +} +static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; +} +#endif + +// ARGBToYJ_C and ARGBToUVJ_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + 
dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ + } + +#endif + +MAKEROWYJ(ARGB, 2, 1, 0, 4) +MAKEROWYJ(ABGR, 0, 1, 2, 4) +MAKEROWYJ(RGBA, 3, 2, 1, 4) +MAKEROWYJ(RGB24, 2, 1, 0, 3) +MAKEROWYJ(RAW, 0, 1, 2, 3) +#undef MAKEROWYJ + +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = src_rgb565[1] >> 3; + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_y[0] = RGBToY(r, g, b); + src_argb1555 += 2; + dst_y += 1; + } +} + +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + b = STATIC_CAST(uint8_t, (b << 4) | b); + g = STATIC_CAST(uint8_t, (g << 4) | g); + r = STATIC_CAST(uint8_t, (r << 4) | r); + dst_y[0] = RGBToY(r, g, b); + src_argb4444 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, 
(src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + } +} + +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b1 = STATIC_CAST(uint8_t, src_argb1555[2] & 0x1f); + uint8_t g1 = 
STATIC_CAST( + uint8_t, (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, (src_argb1555[3] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + uint8_t b3 = STATIC_CAST(uint8_t, next_argb1555[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, (next_argb1555[3] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 3) | (g1 >> 2)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + } +} + +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b1 = src_argb4444[2] & 0x0f; + uint8_t g1 = src_argb4444[2] >> 4; + uint8_t r1 = src_argb4444[3] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b3 = next_argb4444[2] & 0x0f; + uint8_t g3 
= next_argb4444[2] >> 4;
+    uint8_t r3 = next_argb4444[3] & 0x0f;
+
+    b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0);
+    g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0);
+    r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0);
+    b1 = STATIC_CAST(uint8_t, (b1 << 4) | b1);
+    g1 = STATIC_CAST(uint8_t, (g1 << 4) | g1);
+    r1 = STATIC_CAST(uint8_t, (r1 << 4) | r1);
+    b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2);
+    g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2);
+    r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2);
+    b3 = STATIC_CAST(uint8_t, (b3 << 4) | b3);
+    g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3);
+    r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+#else
+    uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+    uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+    uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
+    src_argb4444 += 4;
+    next_argb4444 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8_t b0 = src_argb4444[0] & 0x0f;
+    uint8_t g0 = src_argb4444[0] >> 4;
+    uint8_t r0 = src_argb4444[1] & 0x0f;
+    uint8_t b2 = next_argb4444[0] & 0x0f;
+    uint8_t g2 = next_argb4444[0] >> 4;
+    uint8_t r2 = next_argb4444[1] & 0x0f;
+
+    b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0);
+    g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0);
+    r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0);
+    b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2);
+    g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2);
+    r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(b0, b2);
+    uint8_t ag = AVGB(g0, g2);
+    uint8_t ar = AVGB(r0, r2);
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+#else
+    uint16_t b = b0 + b2;
+    uint16_t g = g0 + g2;
+    uint16_t r = r0 + r2;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
+  }
+}
+
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t ab = src_argb[0];
+    uint8_t ag = src_argb[1];
+    uint8_t ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+    dst_argb[3] = src_argb[3];
+    dst_argb += 4;
+    src_argb += 4;
+  }
+}
+
+// Convert a row of an image to sepia tone.
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0];
+    int g = dst_argb[1];
+    int r = dst_argb[2];
+    int sb = (b * 17 + g * 68 + r * 35) >> 7;
+    int sg = (b * 22 + g * 88 + r * 45) >> 7;
+    int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    // b does not overflow. a is preserved from the original.
+    dst_argb[0] = STATIC_CAST(uint8_t, sb);
+    dst_argb[1] = STATIC_CAST(uint8_t, clamp255(sg));
+    dst_argb[2] = STATIC_CAST(uint8_t, clamp255(sr));
+    dst_argb += 4;
+  }
+}
+
+// Apply a color matrix to a row of an image. Matrix is signed.
+// TODO(fbarchard): Consider adding rounding (+32).
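The matrix coefficients are signed 6-bit fixed point, so 64 represents 1.0 and each weighted sum is shifted right by 6 with no rounding (hence the TODO). A worked identity-matrix check of the arithmetic the next function applies per channel:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
      const int8_t m[16] = {64, 0, 0, 0,   /* B' = 1.0 * B */
                            0, 64, 0, 0,   /* G' = 1.0 * G */
                            0, 0, 64, 0,   /* R' = 1.0 * R */
                            0, 0, 0, 64};  /* A' = 1.0 * A */
      int b = 10, g = 20, r = 30, a = 255;
      int sb = (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6;
      printf("%d\n", sb);  /* 10: (10 * 64) >> 6, truncated, not rounded */
      return 0;
    }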
+void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + + a * matrix_argb[3]) >> + 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + + a * matrix_argb[7]) >> + 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + + a * matrix_argb[11]) >> + 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + + a * matrix_argb[15]) >> + 6; + dst_argb[0] = STATIC_CAST(uint8_t, Clamp(sb)); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp(sg)); + dst_argb[2] = STATIC_CAST(uint8_t, Clamp(sr)); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp(sa)); + src_argb += 4; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = STATIC_CAST( + uint8_t, (b * scale >> 16) * interval_size + interval_offset); + dst_argb[1] = STATIC_CAST( + uint8_t, (g * scale >> 16) * interval_size + interval_offset); + dst_argb[2] = STATIC_CAST( + uint8_t, (r * scale >> 16) * interval_size + interval_offset); + dst_argb += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 24 + +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + const uint32_t b_scale = REPEAT8(value & 0xff); + const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); + const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); + const uint32_t a_scale = REPEAT8(value >> 24); + + int i; + for (i = 0; i < width; ++i) { + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 16 + +void ARGBMultiplyRow_C(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); + const uint32_t b_scale = src_argb1[0]; + const uint32_t g_scale = 
src_argb1[1]; + const uint32_t r_scale = src_argb1[2]; + const uint32_t a_scale = src_argb1[3]; + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_scale)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_scale)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_scale)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_scale)); + src_argb += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +void ARGBAddRow_C(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; + const int b_add = src_argb1[0]; + const int g_add = src_argb1[1]; + const int r_add = src_argb1[2]; + const int a_add = src_argb1[3]; + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_add)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_add)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_add)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_add)); + src_argb += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +void ARGBSubtractRow_C(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; + const int b_sub = src_argb1[0]; + const int g_sub = src_argb1[1]; + const int r_sub = src_argb1[2]; + const int a_sub = src_argb1[3]; + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_sub)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_sub)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_sub)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_sub)); + src_argb += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +// Sobel functions which mimics SSSE3. 
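A note ahead of the Sobel rows: each output is the absolute value of a 1-2-1 weighted sum of differences taken two samples apart, saturated at 255. A tiny self-contained sketch of one X sample (editor's illustration, not part of this patch; SobelXSample is the editor's own helper name):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int Clamp255(int v) { return v > 255 ? 255 : v; }

// One Sobel X response over three rows, mirroring SobelXRow_C: the middle
// row difference is weighted 2x and the result saturates at 255.
static uint8_t SobelXSample(const uint8_t* r0, const uint8_t* r1,
                            const uint8_t* r2) {
  int sobel = (r0[0] - r0[2]) + 2 * (r1[0] - r1[2]) + (r2[0] - r2[2]);
  return (uint8_t)Clamp255(abs(sobel));
}

int main(void) {
  const uint8_t row[3] = {0, 0, 255};  // a hard vertical edge
  printf("%d\n", SobelXSample(row, row, row));  // 1020 clamps to 255
  return 0;
}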
+void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobelx[i] = (uint8_t)(clamp255(sobel)); + } +} + +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobely[i] = (uint8_t)(clamp255(sobel)); + } +} + +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_argb[0] = (uint8_t)(s); + dst_argb[1] = (uint8_t)(s); + dst_argb[2] = (uint8_t)(s); + dst_argb[3] = (uint8_t)(255u); + dst_argb += 4; + } +} + +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = (uint8_t)(s); + } +} + +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int g = clamp255(r + b); + dst_argb[0] = (uint8_t)(b); + dst_argb[1] = (uint8_t)(g); + dst_argb[2] = (uint8_t)(r); + dst_argb[3] = (uint8_t)(255u); + dst_argb += 4; + } +} + +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { + // Copy a Y to RGB. + int x; + for (x = 0; x < width; ++x) { + uint8_t y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// Macros to create SIMD specific yuv to rgb conversion constants. + +// clang-format off + +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +// Bias values include subtract 128 from U and V, bias from Y and rounding. +// For B and R bias is negative. For G bias is positive. 
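To make the arm/riscv bias terms concrete: storing (UB * 128 - YB) lets the kernel use unsigned U directly, since y' + u * UB - (UB * 128 - YB) equals y' + (u - 128) * UB + YB. A quick standalone check (editor's illustration, not part of this patch; the constants are the BT.601 limited-range blue-channel values defined further down):

#include <stdint.h>
#include <stdio.h>

enum { UB = 128, YG = 18997, YB = -1160 };  // I601 blue-channel constants

int main(void) {
  for (int y = 0; y <= 255; y += 51) {
    for (int u = 0; u <= 255; u += 51) {
      int y1 = (int)((uint32_t)(y * 0x0101 * YG) >> 16);
      int direct = y1 + (u - 128) * UB + YB;       // subtract 128 explicitly
      int folded = y1 + u * UB - (UB * 128 - YB);  // precomputed bias form
      if (direct != folded) {
        printf("mismatch at y=%d u=%d\n", y, u);
        return 1;
      }
    }
  }
  printf("bias folding is exact\n");
  return 0;
}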
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ + 0, 0}} +#else +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ + {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} +#endif + +// clang-format on + +#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ + const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ + const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); + +// TODO(fbarchard): Generate SIMD structures from float matrix. + +// BT.601 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 + U * 2.018 +// KR = 0.299; KB = 0.114 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601) +#define UB 129 /* round(2.018 * 64) */ +#else +#define UB 128 /* max(128, round(2.018 * 64)) */ +#endif +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR 102 /* round(1.596 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.601 full range YUV to RGB reference (aka JPEG) +// * R = Y + V * 1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y + U * 1.77200 +// KR = 0.299; KB = 0.114 + +// U and V contributions to R,G,B. +#define UB 113 /* round(1.77200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR 90 /* round(1.40200 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.709 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 + U * 2.112 +// KR = 0.2126, KB = 0.0722 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709) +#define UB 135 /* round(2.112 * 64) */ +#else +#define UB 128 /* max(128, round(2.112 * 64)) */ +#endif +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR 115 /* round(1.793 * 64) */ + +// Y contribution to R,G,B. Scale and bias. 
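The /257 in the YG comments pairs with the 0x0101 widening done in the kernels: Y is replicated to 16 bits (a multiply by 257), so YG is pre-divided by 257 and the product lands in the same 10.6 fixed point as the chroma terms after the >> 16. A small numeric check (editor's illustration, not part of this patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int YG = 18997;  // round(1.164 * 64 * 256 * 256 / 257)
  const int y = 235;     // nominal limited-range white
  int y1 = (int)((uint32_t)(y * 0x0101 * YG) >> 16);
  printf("%d vs %.1f\n", y1, 1.164 * 64 * y);  // 17507 vs 17506.6
  return 0;
}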
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.709 full range YUV to RGB reference +// R = Y + V * 1.5748 +// G = Y - U * 0.18732 - V * 0.46812 +// B = Y + U * 1.8556 +// KR = 0.2126, KB = 0.0722 + +// U and V contributions to R,G,B. +#define UB 119 /* round(1.8556 * 64) */ +#define UG 12 /* round(0.18732 * 64) */ +#define VG 30 /* round(0.46812 * 64) */ +#define VR 101 /* round(1.5748 * 64) */ + +// Y contribution to R,G,B. Scale and bias. (same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.2020 limited range YUV to RGB reference +// R = (Y - 16) * 1.164384 + V * 1.67867 +// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 +// B = (Y - 16) * 1.164384 + U * 2.14177 +// KR = 0.2627; KB = 0.0593 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020) +#define UB 137 /* round(2.142 * 64) */ +#else +#define UB 128 /* max(128, round(2.142 * 64)) */ +#endif +#define UG 12 /* round(0.187326 * 64) */ +#define VG 42 /* round(0.65042 * 64) */ +#define VR 107 /* round(1.67867 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.2020 full range YUV to RGB reference +// R = Y + V * 1.474600 +// G = Y - U * 0.164553 - V * 0.571353 +// B = Y + U * 1.881400 +// KR = 0.2627; KB = 0.0593 + +#define UB 120 /* round(1.881400 * 64) */ +#define UG 11 /* round(0.164553 * 64) */ +#define VG 37 /* round(0.571353 * 64) */ +#define VR 94 /* round(1.474600 * 64) */ + +// Y contribution to R,G,B. Scale and bias. (same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +#undef BB +#undef BG +#undef BR + +#undef MAKEYUVCONSTANTS + +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVCoeff[0]; \ + int vr = yuvconstants->kUVCoeff[1]; \ + int ug = yuvconstants->kUVCoeff[2]; \ + int vg = yuvconstants->kUVCoeff[3]; \ + int yg = yuvconstants->kRGBCoeffBias[0]; \ + int bb = yuvconstants->kRGBCoeffBias[1]; \ + int bg = yuvconstants->kRGBCoeffBias[2]; \ + int br = yuvconstants->kRGBCoeffBias[3] + +#define CALC_RGB16 \ + int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ + int b16 = y1 + (u * ub) - bb; \ + int g16 = y1 + bg - (u * ug + v * vg); \ + int r16 = y1 + (v * vr) - br +#else +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVToB[0]; \ + int ug = yuvconstants->kUVToG[0]; \ + int vg = yuvconstants->kUVToG[1]; \ + int vr = yuvconstants->kUVToR[1]; \ + int yg = yuvconstants->kYToRgb[0]; \ + int yb = yuvconstants->kYBiasToRgb[0] + +#define CALC_RGB16 \ + int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ + int8_t ui = (int8_t)u; \ + int8_t vi = (int8_t)v; \ + ui -= 0x80; \ + vi -= 0x80; \ + int b16 = y1 + (ui * ub); \ + int g16 = y1 - (ui * ug + vi * vg); \ + int r16 = y1 + (vi * vr) +#endif + +// C reference code that mimics the YUV assembly. 
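Putting the constants together, the following sketch (editor's illustration, not part of this patch; YuvToRgbJpeg and ClampTo255 are the editor's own names) specializes the non-arm YuvPixel path below to the full-range JPEG constants defined above. Note the int8_t wrap in the original (ui -= 0x80) nets out to a plain subtract of 128, which is what the sketch uses:

#include <stdint.h>
#include <stdio.h>

static int ClampTo255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// Full-range (JPEG) YUV to RGB in 10.6 fixed point until the final >> 6.
static void YuvToRgbJpeg(uint8_t y, uint8_t u, uint8_t v,
                         int* b, int* g, int* r) {
  const int UB = 113, UG = 22, VG = 46, VR = 90, YG = 16320, YB = 32;
  int y1 = (int)((uint32_t)(y * 0x0101 * YG) >> 16) + YB;
  int ui = (int)u - 0x80;  // net effect of the int8_t wrap in the original
  int vi = (int)v - 0x80;
  *b = ClampTo255((y1 + ui * UB) >> 6);
  *g = ClampTo255((y1 - (ui * UG + vi * VG)) >> 6);
  *r = ClampTo255((y1 + vi * VR) >> 6);
}

int main(void) {
  int b, g, r;
  YuvToRgbJpeg(128, 128, 128, &b, &g, &r);
  printf("grey:  %d %d %d\n", b, g, r);   // 128 128 128
  YuvToRgbJpeg(255, 128, 128, &b, &g, &r);
  printf("white: %d %d %d\n", b, g, r);   // 255 255 255
  return 0;
}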
+// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); +} + +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel8_16(uint8_t y, + uint8_t u, + uint8_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel10_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = (y << 6) | (y >> 4); + u = STATIC_CAST(uint8_t, clamp255(u >> 2)); + v = STATIC_CAST(uint8_t, clamp255(v >> 2)); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 12 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel12_16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = (y << 4) | (y >> 8); + u = STATIC_CAST(uint8_t, clamp255(u >> 4)); + v = STATIC_CAST(uint8_t, clamp255(v >> 4)); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); +} + +// C reference code that mimics the YUV 12 bit assembly. +// Reads 12 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel12(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 8 bit. +static __inline void YuvPixel16_8(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); + CALC_RGB16; + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 16 bit. 
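These helpers widen narrower Y to 16 bits by replicating the top bits so that full scale maps to full scale. A compact check of the three widening patterns used above (editor's illustration, not part of this patch):

#include <stdint.h>
#include <stdio.h>

// 255 * 0x0101, (y << 6) | (y >> 4) and (y << 4) | (y >> 8) all reach
// 65535 at the maximum 8-, 10- and 12-bit inputs respectively.
int main(void) {
  uint16_t y8 = 255, y10 = 1023, y12 = 4095;
  printf("%u %u %u\n",
         (unsigned)(y8 * 0x0101),
         (unsigned)((y10 << 6) | (y10 >> 4)),
         (unsigned)((y12 << 4) | (y12 >> 8)));  // 65535 65535 65535
  return 0;
}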
+static __inline void YuvPixel16_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + +// C reference code that mimics the YUV assembly. +// Reads 8 bit YUV and leaves result as 8 bit. +static __inline void YPixel(uint8_t y, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) + int yg = yuvconstants->kRGBCoeffBias[0]; + int ygb = yuvconstants->kRGBCoeffBias[4]; +#else + int ygb = yuvconstants->kYBiasToRgb[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); +} + +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +void I444ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 3; // Advance 1 pixel. + } +} + +// Also used for 420 +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +// 10 bit YUV to ARGB +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = STATIC_CAST(uint8_t, clamp255(src_a[1] >> 2)); + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); + } +} + +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); + src_y += 1; + src_u += 1; + src_v += 1; + src_a += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +// 12 bit YUV to ARGB +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { + uint32_t ar30; + b = b >> 4; // convert 8 bit 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; + (*(uint32_t*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +// 12 bit YUV to 10 bit AR30 +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +// P210 has 10 bits in msb of 16 bit NV12 style layout. +void P210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5, + dst_argb + 6, yuvconstants); + dst_argb[7] = 255; + src_y += 2; + src_uv += 2; + dst_argb += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + } +} + +void P410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + src_y += 1; + src_uv += 2; + dst_argb += 4; // Advance 1 pixels. + } +} + +void P210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30 + 4, b, g, r); + src_y += 2; + src_uv += 2; + dst_ar30 += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + } +} + +void P410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + src_y += 1; + src_uv += 2; + dst_ar30 += 4; // Advance 1 pixel. 
+ } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I444AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + src_y += 1; + src_u += 1; + src_v += 1; + src_a += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = src_a[1]; + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + } +} + +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + b1 = b1 >> 4; + g1 = g1 >> 4; + r1 = r1 >> 4; + *(uint16_t*)(dst_argb4444 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000); + *(uint16_t*)(dst_argb4444 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | 0xf000); + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + *(uint16_t*)(dst_argb4444) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000); + } +} + +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 3; + r1 = r1 >> 3; + *(uint16_t*)(dst_argb1555 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); + *(uint16_t*)(dst_argb1555 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | 0x8000); + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + *(uint16_t*)(dst_argb1555) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); + } +} + +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb565 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); + src_y += 2; + src_u += 1; + src_v += 1; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + } +} + +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_vu += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_uv += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_vu += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint16_t*)(dst_rgb565 + 0) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); + *(uint16_t*)(dst_rgb565 + 2) = STATIC_CAST(uint16_t, b1) | + STATIC_CAST(uint16_t, g1 << 5) | + STATIC_CAST(uint16_t, r1 << 11); + src_y += 2; + src_uv += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16_t*)(dst_rgb565) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); + } +} + +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_yuy2 += 4; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_uyvy += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); + rgb_buf[0] = 255; + } +} + +void I400ToARGBRow_C(const uint8_t* src_y, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width; ++x) { + dst_uv[0] = src_uv[0]; + dst_uv[1] = src_uv[1]; + src_uv -= 2; + dst_uv += 2; + } +} + +void MirrorSplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { + int x; + const uint32_t* src32 = (const uint32_t*)(src); + uint32_t* dst32 = (uint32_t*)(dst); + src32 += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { + int x; + src_rgb24 += width * 3 - 3; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + src_rgb24 -= 3; + dst_rgb24 += 3; + } +} + +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x]; + dst_uv[1] = src_v[x]; + dst_uv[2] = src_u[x + 1]; + dst_uv[3] = src_v[x + 1]; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1]; + dst_uv[1] = src_v[width - 1]; + } +} + +void DetileRow_C(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + memcpy(dst, src, 16); + dst += 16; + src += src_tile_stride; + } + if (width & 15) { + memcpy(dst, src, width & 15); + } +} + +void DetileRow_16_C(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + memcpy(dst, src, 16 * sizeof(uint16_t)); + dst += 16; + src += src_tile_stride; + } + if (width & 15) { + memcpy(dst, src, (width & 15) * sizeof(uint16_t)); + } +} + +void DetileSplitUVRow_C(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + SplitUVRow_C(src_uv, dst_u, dst_v, 8); + dst_u += 8; + dst_v += 8; + src_uv += src_tile_stride; + } + if 
(width & 15) { + SplitUVRow_C(src_uv, dst_u, dst_v, ((width & 15) + 1) / 2); + } +} + +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + for (int x = 0; x < width - 15; x += 16) { + for (int i = 0; i < 8; i++) { + dst_yuy2[0] = src_y[0]; + dst_yuy2[1] = src_uv[0]; + dst_yuy2[2] = src_y[1]; + dst_yuy2[3] = src_uv[1]; + dst_yuy2 += 4; + src_y += 2; + src_uv += 2; + } + src_y += src_y_tile_stride - 16; + src_uv += src_uv_tile_stride - 16; + } +} + +// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded +// in 80 byte blocks representing 64 pixels each. The first 16 bytes of the +// block contain all of the lower 2 bits of each pixel packed together, and the +// next 64 bytes represent all the upper 8 bits of the pixel. The lower bits are +// packed into 1x4 blocks, whereas the upper bits are packed in normal raster +// order. +void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) { + for (size_t i = 0; i < size; i += 80) { + const uint8_t* src_lower_bits = src; + const uint8_t* src_upper_bits = src + 16; + + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 16; k++) { + *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 | + (uint16_t)*src_upper_bits << 8 | + (uint16_t)*src_upper_bits >> 2; + src_upper_bits++; + } + } + + src += 80; + } +} + +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_r[x] = src_rgb[0]; + dst_g[x] = src_rgb[1]; + dst_b[x] = src_rgb[2]; + src_rgb += 3; + } +} + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_rgb[0] = src_r[x]; + dst_rgb[1] = src_g[x]; + dst_rgb[2] = src_b[x]; + dst_rgb += 3; + } +} + +void SplitARGBRow_C(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_b[x] = src_argb[0]; + dst_g[x] = src_argb[1]; + dst_r[x] = src_argb[2]; + dst_a[x] = src_argb[3]; + src_argb += 4; + } +} + +void MergeARGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_b[x]; + dst_argb[1] = src_g[x]; + dst_argb[2] = src_r[x]; + dst_argb[3] = src_a[x]; + dst_argb += 4; + } +} + +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + assert(depth >= 10); + assert(depth <= 16); + int x; + int shift = depth - 10; + uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; + for (x = 0; x < width; ++x) { + uint32_t r = clamp1023(src_r[x] >> shift); + uint32_t g = clamp1023(src_g[x] >> shift); + uint32_t b = clamp1023(src_b[x] >> shift); + dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; + } +} + +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = 
STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); + dst_ar64[3] = STATIC_CAST(uint16_t, ClampMax(src_a[x], max) << shift); + dst_ar64 += 4; + } +} + +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); + dst_argb[3] = STATIC_CAST(uint8_t, clamp255(src_a[x] >> shift)); + dst_argb += 4; + } +} + +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); + dst_ar64[3] = 0xffff; + dst_ar64 += 4; + } +} + +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); + dst_argb[3] = 0xff; + dst_argb += 4; + } +} + +void SplitXRGBRow_C(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_b[x] = src_argb[0]; + dst_g[x] = src_argb[1]; + dst_r[x] = src_argb[2]; + src_argb += 4; + } +} + +void MergeXRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_b[x]; + dst_argb[1] = src_g[x]; + dst_argb[2] = src_r[x]; + dst_argb[3] = 255; + dst_argb += 4; + } +} + +// Convert lsb formats to msb, depending on sample depth. +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + assert(depth >= 8); + assert(depth <= 16); + int x; + for (x = 0; x < width; ++x) { + dst_uv[0] = STATIC_CAST(uint16_t, src_u[x] << shift); + dst_uv[1] = STATIC_CAST(uint16_t, src_v[x] << shift); + dst_uv += 2; + } +} + +// Convert msb formats to lsb, depending on sample depth. 
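A round-trip sketch for the 16-bit merge/split pair (editor's illustration, not part of this patch): lsb-justified samples are shifted into the msbs on merge and back down on split, so the values survive unchanged:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int depth = 10, shift = 16 - depth;
  uint16_t u = 512, v = 1023;  // lsb-justified 10-bit samples
  uint16_t uv[2] = {(uint16_t)(u << shift),   // MergeUVRow_16_C direction
                    (uint16_t)(v << shift)};
  printf("%d %d\n", uv[0] >> shift, uv[1] >> shift);  // 512 1023 (SplitUVRow_16_C direction)
  return 0;
}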
+void SplitUVRow_16_C(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + int shift = 16 - depth; + int x; + assert(depth >= 8); + assert(depth <= 16); + for (x = 0; x < width; ++x) { + dst_u[x] = src_uv[0] >> shift; + dst_v[x] = src_uv[1] >> shift; + src_uv += 2; + } +} + +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = STATIC_CAST(uint16_t, src_y[x] * scale); + } +} + +void DivideRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int x; + assert(scale >= 256); + assert(scale <= 32768); + + for (x = 0; x < width; ++x) { + dst_y[x] = STATIC_CAST(uint8_t, C16TO8(src_y[x], scale)); + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 1024 = 10 bits +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + scale *= 0x0101; // replicates the byte. + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { + memcpy(dst, src, count); +} + +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { + memcpy(dst, src, count * 2); +} + +void SetRow_C(uint8_t* dst, uint8_t v8, int width) { + memset(dst, v8, width); +} + +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + for (x = 0; x < width; ++x) { + memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32); + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Filter 2 rows of YUY2 UV's (422) into UV (NV12). +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_uv[1] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_uv += 2; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + // Output a row of Y values. 
+ int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) + +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. +// This code mimics the SSSE3 version for better testability. +void ARGBBlendRow_C(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); + dst_argb[3] = 255u; + + fb = src_argb[4 + 0]; + fg = src_argb[4 + 1]; + fr = src_argb[4 + 2]; + a = src_argb[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[4 + 1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[4 + 2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); + dst_argb[4 + 3] = 255u; + src_argb += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); + dst_argb[3] = 255u; + } +} +#undef BLEND + +#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + dst[1] = UBLEND(src0[1], src1[1], alpha[1]); + src0 += 2; + src1 += 2; + alpha += 2; + dst += 2; + } + if (width & 1) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + } +} +#undef UBLEND + +#if LIBYUV_ATTENUATE_DUP +// This code mimics the SSSE3 version for better testability. 
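/*
 * Editor's illustration (not part of this patch): the two ATTENUATE forms
 * that follow are close but not identical. The duplicated form computes
 * roughly floor(f * a / 254), while the rounded form computes
 * (f * a + 128) >> 8, so for f = a = 255 the duplicated form keeps 255
 * where the rounded form loses one:
 *   dup:   (0xFFFF * 0xFFFF) >> 24 = 255
 *   round: (255 * 255 + 128) >> 8  = 254
 * The duplicated variant is kept to match the SSSE3 output bit for bit.
 */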
+#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 +#else +#define ATTENUATE(f, a) (f * a + 128) >> 8 +#endif + +// Multiply source RGB by alpha and store to destination. +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + uint32_t a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = STATIC_CAST(uint8_t, a); + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = STATIC_CAST(uint8_t, a); + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32_t b = src_argb[0]; + const uint32_t g = src_argb[1]; + const uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = STATIC_CAST(uint8_t, a); + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. +#define T(a) 0x01000000 + (0x10000 / a) +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), 
T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; +#undef T + +#if LIBYUV_UNATTENUATE_DUP +// This code mimics the Intel SIMD version for better testability. +#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16) +#else +#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8) +#endif + +// mimics the Intel SIMD code for exactness. +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + + // Clamping should not be necessary but is free in assembly. + dst_argb[0] = STATIC_CAST(uint8_t, UNATTENUATE(b, ia)); + dst_argb[1] = STATIC_CAST(uint8_t, UNATTENUATE(g, ia)); + dst_argb[2] = STATIC_CAST(uint8_t, UNATTENUATE(r, ia)); + dst_argb[3] = STATIC_CAST(uint8_t, a); + src_argb += 4; + dst_argb += 4; + } +} + +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + int32_t row_sum[4] = {0, 0, 0, 0}; + int x; + for (x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count) { + float ooa; + int i; + assert(area != 0); + + ooa = 1.0f / STATIC_CAST(float, area); + for (i = 0; i < count; ++i) { + dst[0] = + (uint8_t)(STATIC_CAST(float, bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * + ooa); + dst[1] = + (uint8_t)(STATIC_CAST(float, bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * + ooa); + dst[2] = + (uint8_t)(STATIC_CAST(float, bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * + ooa); + dst[3] = + (uint8_t)(STATIC_CAST(float, bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * + ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { + int i; + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (i = 0; i < width; ++i) { + int x = (int)(uv[0]); + int y = (int)(uv[1]); + *(uint32_t*)(dst_argb) = + *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// Blend 2 rows into 1. 
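+// The (a + b + 1) >> 1 form used below averages the two rows with
+// round-to-nearest rather than truncation: e.g. (3 + 4 + 1) >> 1 = 4.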
+static void HalfRow_C(const uint8_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +static void HalfRow_16_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint16_t* dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +static void HalfRow_16To8_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = STATIC_CAST( + uint8_t, + C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale)); + } +} + +// C version 2x2 -> 2x1. +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + + if (y1_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (y1_fraction == 128) { + HalfRow_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width; ++x) { + dst_ptr[0] = STATIC_CAST( + uint8_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); + ++src_ptr; + ++src_ptr1; + ++dst_ptr; + } +} + +// C version 2x2 -> 2x1. +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + + if (y1_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (y1_fraction == 128) { + HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width; ++x) { + dst_ptr[0] = STATIC_CAST( + uint16_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); + ++src_ptr; + ++src_ptr1; + ++dst_ptr; + } +} + +// C version 2x2 16 bit-> 2x1 8 bit. +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits + +void InterpolateRow_16To8_C(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + + if (source_y_fraction == 0) { + Convert16To8Row_C(src_ptr, dst_ptr, scale, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width); + return; + } + for (x = 0; x < width; ++x) { + dst_ptr[0] = STATIC_CAST( + uint8_t, + C16TO8( + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, + scale)); + src_ptr += 1; + src_ptr1 += 1; + dst_ptr += 1; + } +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. 
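+  // e.g. shuffler = {2, 1, 0, 3} swaps bytes 0 and 2 of every pixel,
+  // converting ARGB to ABGR (see kShuffleARGBToABGR in row_gcc.cc).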
+ int x; + for (x = 0; x < width; ++x) { + // To support in-place conversion. + uint8_t b = src_argb[index0]; + uint8_t g = src_argb[index1]; + uint8_t r = src_argb[index2]; + uint8_t a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = 0; + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = 0; + } +} + +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = STATIC_CAST(uint8_t, Clamp((int32_t)(db))); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp((int32_t)(dg))); + dst_argb[2] = STATIC_CAST(uint8_t, Clamp((int32_t)(dr))); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp((int32_t)(da))); + src_argb += 4; + dst_argb += 4; + } +} + +// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor +// adjust the source integer range to the half float range desired. + +// This magic constant is 2^-112. Multiplying by this +// is the same as subtracting 112 from the exponent, which +// is the difference in exponent bias between 32-bit and +// 16-bit floats. Once we've done this subtraction, we can +// simply extract the low bits of the exponent and the high +// bits of the mantissa from our float and we're done. 
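+
+// Editor's sketch (not part of the original change; the helper name is
+// illustrative, not a libyuv API): the trick above applied to one value,
+// using memcpy for the type pun. With scale = 1.0f and src = 1, value is
+// 2^-112 (float bits 0x07800000), and 0x07800000 >> 13 = 0x3c00, which is
+// 1.0 in IEEE half precision.
+static inline uint16_t HalfFloatSketch_C(uint16_t src, float scale) {
+  float value = src * (1.9259299444e-34f * scale);  // 2^-112 * scale
+  uint32_t bits;
+  memcpy(&bits, &value, sizeof(bits));  // type pun without aliasing issues
+  return (uint16_t)(bits >> 13);  // exponent lands in bits 14:10, mantissa in 9:0
+}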
+ +// Work around GCC 7 punning warning -Wstrict-aliasing +#if defined(__GNUC__) +typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; +#else +typedef uint32_t uint32_alias_t; +#endif + +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + float mult = 1.9259299444e-34f * scale; + for (i = 0; i < width; ++i) { + float value = src[i] * mult; + dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); + } +} + +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float value = src[i] * scale; + dst[i] = value; + } +} + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uint32_t bc = lumacoeff & 0xff; + uint32_t gc = (lumacoeff >> 8) & 0xff; + uint32_t rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + const uint8_t* luma1; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + luma1 = + ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst_a[0] = src_argb[3]; + dst_a[1] = src_argb[7]; + dst_a += 2; + src_argb += 8; + } + if (width & 1) { + dst_a[0] = src_argb[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} + +// Maximum temporary width for wrappers to process at a time, in pixels. +#define MAXTWIDTH 2048 + +#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ + defined(HAS_I422TORGB565ROW_SSSE3) +// row_win.cc has asm version, but GCC uses 2 step wrapper. +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_SSSE3) +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_SSSE3) +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_SSSE3) +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_SSSE3) +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_SSSE3) +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_AVX2) +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. 
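+  // MAXTWIDTH is 2048 and ARGB is 4 bytes per pixel, so each pass converts
+  // a strip of at most 2048 pixels to ARGB before repacking to RGB24.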
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_AVX2) +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_AVX2) +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); +#else + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_AVX2) +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); +#else + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB24ROW_AVX2) +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I444TORGB24ROW_AVX2) +void I444ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_u += twidth; + src_v += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_AVX2) +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#ifdef HAS_RGB24TOYJROW_AVX2 +// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RGB24TOYJROW_AVX2 + +#ifdef HAS_RAWTOYJROW_AVX2 +// Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RAWTOYJROW_AVX2 + +#ifdef HAS_RGB24TOYJROW_SSSE3 +// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RGB24TOYJROW_SSSE3 + +#ifdef HAS_RAWTOYJROW_SSSE3 +// Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RAWTOYJROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_16TO8_AVX2 +void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction) { + // Row buffer for intermediate 16 bit pixels. + SIMD_ALIGNED(uint16_t row[MAXTWIDTH]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction); + Convert16To8Row_AVX2(row, dst_ptr, scale, twidth); + src_ptr += twidth; + dst_ptr += twidth; + width -= twidth; + } +} +#endif // HAS_INTERPOLATEROW_16TO8_AVX2 + +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fsum = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + fsum += v * v; + *dst++ = v * scale; + } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src++ * scale; + } +} + +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = STATIC_CAST( + uint16_t, + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8); + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + +void GaussRow_F32_C(const float* src, float* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * + (1.0f / 256.0f); + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
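+// The 1, 4, 6, 4, 1 taps sum to 16, so the row and column passes together
+// weigh each pixel by 16 * 16 = 256; the single 1/256 in GaussRow_F32_C
+// (and the +128 >> 8 rounding in the integer GaussRow_C) normalizes the
+// whole 5x5 filter, leaving this column pass unnormalized.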
+void GaussCol_F32_C(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_yuv24[0] = src_vu[0]; // V + dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[2] = src_y[0]; // Y0 + dst_yuv24[3] = src_vu[0]; // V + dst_yuv24[4] = src_vu[1]; // U + dst_yuv24[5] = src_y[1]; // Y1 + src_y += 2; + src_vu += 2; + dst_yuv24 += 6; // Advance 2 pixels. + } + if (width & 1) { + dst_yuv24[0] = src_vu[0]; // V + dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[2] = src_y[0]; // Y0 + } +} + +// Filter 2 rows of AYUV UV's (444) into UV (420). +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2x2 rows of AYUV. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + src_ayuv += 8; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; + dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; + } +} + +// Filter 2 rows of AYUV UV's (444) into VU (420). +void AYUVToVURow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + // Output a row of VU values, filtering 2x2 rows of AYUV. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + src_ayuv += 8; + dst_vu += 2; + } + if (width & 1) { + dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; + dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; + } +} + +// Copy row of AYUV Y's into Y +void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_ayuv[2]; // v,u,y,a + src_ayuv += 4; + } +} + +// Convert UV plane of NV12 to VU of NV21. 
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t u = src_uv[0]; + uint8_t v = src_uv[1]; + dst_vu[0] = v; + dst_vu[1] = u; + src_uv += 2; + dst_vu += 2; + } +} + +void HalfMergeUVRow_C(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] + + src_u[src_stride_u + 1] + 2) >> + 2; + dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] + + src_v[src_stride_v + 1] + 2) >> + 2; + src_u += 2; + src_v += 2; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1; + dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1; + } +} + +#undef STATIC_CAST + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc new file mode 100644 index 00000000..e94fd04d --- /dev/null +++ b/source/row_gcc.cc @@ -0,0 +1,9731 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +// Constants for ARGB +static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, + 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u}; + +// JPeg full range. 
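+// Full-range BT.601 luma weights scaled by 256 and stored in B,G,R order:
+// 29 = round(0.114 * 256), 150 = round(0.587 * 256), 77 = round(0.299 * 256);
+// the three sum to 256.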
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, + 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; + +static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u, + 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u}; + +static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, + 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; + +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; + +static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0, + -43, -84, 127, 0, -43, -84, 127, 0}; + +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; + +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; + +static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0, + 127, -107, -20, 0, 127, -107, -20, 0}; + +// Constants for BGRA +static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, + 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; + +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; + +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; + +// Constants for ABGR +static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, + 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; + +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; + +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; + +// Constants for RGBA. +static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, + 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; + +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; + +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; + +static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, + 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; + +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; + +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; + +// Shuffle table for converting RAW to RGBA. +static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u, + 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u}; + +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Middle 8. 
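+// (Each of the three tables emits 8 output bytes from a 16-byte load taken
+// at byte offset 0, 4 or 8 of the RAW row; index 2 below, for example, is
+// absolute byte 6, the R of RAW pixel 2, which becomes the third byte of
+// RGB24 pixel 2.)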
+static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; + +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#ifdef HAS_J400TOARGBROW_SSE2 +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_J400TOARGBROW_SSE2 + +#ifdef HAS_RGB24TOARGBROW_SSSE3 +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm4", "xmm5"); +} + +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +// Same code as RAWToARGB with different shuffler and A in low bits +void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff + "psrld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgba), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGBA) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 
\n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + 
"jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTORGB24ROW_AVX2 +// vpermd for 12+12 to 24 +static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; + +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI +// Shuffle table for converting ARGBToRGB24 +static const ulvec8 kPermARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, + 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, + 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; +static const ulvec8 kPermARGBToRGB24_1 = { + 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, + 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, + 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; +static const ulvec8 kPermARGBToRGB24_2 = { + 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, + 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, + 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu 
(%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kPermARGBToRGB24_0), // %3 + "m"(kPermARGBToRGB24_1), // %4 + "m"(kPermARGBToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORAWROW_AVX2 +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + uint32_t dither4, + int width) { + asm volatile( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + 
"psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + uint32_t dither4, + int width) { + asm volatile( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); +} + +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_RGB24TOARGBROW_SSSE3 + +/* + +ARGBToAR30Row: + +Red Blue +With the 8 bit value in the upper bits of a short, 
vpmulhuw by (1024+4) will +produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats +wanted for the blue channel. The red needs to be shifted 4 left, so multiply by +(1024+4)*16 for red. + +Alpha Green +Alpha and Green are already in the high bits so vpand can zero out the other +bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier +could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha +would be a simple multiplier to shift it into position. It wants a gap of 10 +above the green. Green is 10 bits, so there are 6 bits in the low short. 4 +more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, +and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the +result left 10 to position the A and G channels. +*/ + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, + 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; + +static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, + 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; + +static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; +static const uint32_t kMaskRB10 = 0x3ff003ff; +static const uint32_t kMaskAG10 = 0xc000ff00; +static const uint32_t kMulAG10 = 64 * 65536 + 1028; + +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa %3,%%xmm2 \n"  // shuffler for RB
+      "movd %4,%%xmm3 \n"  // multiplier for RB
+      "movd %5,%%xmm4 \n"  // mask for R10 B10
+      "movd %6,%%xmm5 \n"  // mask for AG
+      "movd %7,%%xmm6 \n"  // multiplier for AG
+      "pshufd $0x0,%%xmm3,%%xmm3 \n"
+      "pshufd $0x0,%%xmm4,%%xmm4 \n"
+      "pshufd $0x0,%%xmm5,%%xmm5 \n"
+      "pshufd $0x0,%%xmm6,%%xmm6 \n"
+      "sub %0,%1 \n"
+
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"  // fetch 4 ARGB pixels
+      "movdqa %%xmm0,%%xmm1 \n"
+      "pshufb %%xmm2,%%xmm1 \n"  // R0B0
+      "pand %%xmm5,%%xmm0 \n"  // A0G0
+      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
+      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
+      "pand %%xmm4,%%xmm1 \n"  // X2 R10 X10 B10
+      "pslld $10,%%xmm0 \n"  // A2 x10 G10 x10
+      "por %%xmm1,%%xmm0 \n"  // A2 R10 G10 B10
+      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
+      "add $0x10,%0 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa %3,%%xmm2 \n"  // shuffler for RB
+      "movd %4,%%xmm3 \n"  // multiplier for RB
+      "movd %5,%%xmm4 \n"  // mask for R10 B10
+      "movd %6,%%xmm5 \n"  // mask for AG
+      "movd %7,%%xmm6 \n"  // multiplier for AG
+      "pshufd $0x0,%%xmm3,%%xmm3 \n"
+      "pshufd $0x0,%%xmm4,%%xmm4 \n"
+      "pshufd $0x0,%%xmm5,%%xmm5 \n"
+      "pshufd $0x0,%%xmm6,%%xmm6 \n"
+      "sub %0,%1 \n"
+
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"  // fetch 4 ABGR pixels
+      "movdqa %%xmm0,%%xmm1 \n"
+      "pshufb %%xmm2,%%xmm1 \n"  // R0B0
+      "pand %%xmm5,%%xmm0 \n"  // A0G0
+      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
+      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
+      "pand %%xmm4,%%xmm1 \n"  // X2 R10 X10 B10
+      "pslld $10,%%xmm0 \n"  // A2 x10 G10 x10
+      "por %%xmm1,%%xmm0 \n"  // A2 R10 G10 B10
+      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
+      "add $0x10,%0 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(kShuffleBR30),  // %3 reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
+      "vbroadcastss %4,%%ymm3 \n"  // multiplier for RB
+      "vbroadcastss %5,%%ymm4 \n"  // mask for R10 B10
+      "vbroadcastss %6,%%ymm5 \n"  // mask for AG
+      "vbroadcastss %7,%%ymm6 \n"  // multiplier for AG
+      "sub %0,%1 \n"
+
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ARGB pixels
+      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
+      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
+      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
+      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
+      "vpand %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
+      "vpslld $10,%%ymm0,%%ymm0 \n"  // A2 x10 G10 x10
+      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
+      "vmovdqu %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
+      "add $0x20,%0 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
+      "vbroadcastss %4,%%ymm3 \n"  // multiplier for RB
+      "vbroadcastss %5,%%ymm4 \n"  // mask for R10 B10
+      "vbroadcastss %6,%%ymm5 \n"  // mask for AG
+      "vbroadcastss %7,%%ymm6 \n"  // multiplier for AG
+      "sub %0,%1 \n"
+
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ABGR pixels
+      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
+      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
+      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
+      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
+      "vpand %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
+      "vpslld $10,%%ymm0,%%ymm0 \n"  // A2 x10 G10 x10
+      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
+      "vmovdqu %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
+      "add $0x20,%0 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(kShuffleBR30),  // %3 reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
+}
+#endif
+
+static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
+                                         10, 9, 8, 11, 14, 13, 12, 15};
+
+static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
+                                           6, 6, 5, 5, 4, 4, 7, 7};
+static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9,  9,  8,  8,  11, 11,
+                                           14, 14, 13, 13, 12, 12, 15, 15};
+
+void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
+                         uint16_t* dst_ar64,
+                         int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "punpcklbw %%xmm0,%%xmm0 \n"
+      "punpckhbw %%xmm1,%%xmm1 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "lea 0x10(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_ar64),  // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
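+
+// ARGBToAR64Row above widens each 8 bit channel to 16 bits by duplicating
+// the byte (punpcklbw with itself), i.e. v * 0x0101, so 0 maps to 0 and 255
+// maps to 65535 exactly. A scalar sketch, illustrative only:
+static uint16_t Widen8To16_Sketch(uint8_t v) {
+  return (uint16_t)(v * 0x0101);  // same as (v << 8) | v
+}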
+ "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pshufb %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +void ARGBToAR64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +void ARGBToAB64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" + "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm1 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_AB64TOARGBROW_AVX2 +void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN 
+ "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +// clang-format off + +// TODO(mraptis): Consider passing R, G, B multipliers as parameter. +// round parameter is register containing value to add before shift. +#define RGBTOY(round) \ + "1: \n" \ + "movdqu (%0),%%xmm0 \n" \ + "movdqu 0x10(%0),%%xmm1 \n" \ + "movdqu 0x20(%0),%%xmm2 \n" \ + "movdqu 0x30(%0),%%xmm3 \n" \ + "psubb %%xmm5,%%xmm0 \n" \ + "psubb %%xmm5,%%xmm1 \n" \ + "psubb %%xmm5,%%xmm2 \n" \ + "psubb %%xmm5,%%xmm3 \n" \ + "movdqu %%xmm4,%%xmm6 \n" \ + "pmaddubsw %%xmm0,%%xmm6 \n" \ + "movdqu %%xmm4,%%xmm0 \n" \ + "pmaddubsw %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm4,%%xmm1 \n" \ + "pmaddubsw %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm4,%%xmm2 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ + "lea 0x40(%0),%0 \n" \ + "phaddw %%xmm0,%%xmm6 \n" \ + "phaddw %%xmm2,%%xmm1 \n" \ + "prefetcht0 1280(%0) \n" \ + "paddw %%" #round ",%%xmm6 \n" \ + "paddw %%" #round ",%%xmm1 \n" \ + "psrlw $0x8,%%xmm6 \n" \ + "psrlw $0x8,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm6 \n" \ + "movdqu %%xmm6,(%1) \n" \ + "lea 0x10(%1),%1 \n" \ + "sub $0x10,%2 \n" \ + "jg 1b \n" + +#define RGBTOY_AVX2(round) \ + "1: \n" \ + "vmovdqu (%0),%%ymm0 \n" \ + "vmovdqu 0x20(%0),%%ymm1 \n" \ + "vmovdqu 0x40(%0),%%ymm2 \n" \ + "vmovdqu 0x60(%0),%%ymm3 \n" \ + "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ + "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ + "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ + "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ + "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ + "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ + "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ + "lea 0x80(%0),%0 \n" \ + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ + "prefetcht0 1280(%0) \n" \ + "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ + "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ + "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ + "vmovdqu %%ymm0,(%1) \n" \ + "lea 0x20(%1),%1 \n" \ + "sub $0x20,%2 \n" \ + "jg 1b \n" \ + "vzeroupper \n" + +// clang-format on + +#ifdef HAS_ARGBTOYROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_ARGBTOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16. 
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBTOYJROW_SSSE3 + +#ifdef HAS_ABGRTOYJROW_SSSE3 +// Convert 16 ABGR pixels (64 bytes) to 16 YJ values. +// Same as ABGRToYRow but different coefficients, no add 16. +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ABGRTOYJROW_SSSE3 + +#ifdef HAS_RGBATOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16. +void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_RGBATOYJROW_SSSE3 + +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ + defined(HAS_ARGBEXTRACTALPHAROW_AVX2) +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16), // %5 + "m"(kPermdARGBToY_AVX) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ABGRTOYROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16), // %5 + "m"(kPermdARGBToY_AVX) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
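+// A scalar model of the vpermd fix-up used by these AVX2 paths: 256 bit
+// vphaddw and vpackuswb operate within each 128 bit lane, so the packed
+// bytes come out interleaved and kPermdARGBToY_AVX restores dword order.
+// Illustrative only; vpermd computes out[i] = in[perm[i]]:
+static void PermdUnmutate_Sketch(const uint32_t in[8], uint32_t out[8]) {
+  static const int kPerm[8] = {0, 4, 1, 5, 2, 6, 3, 7};  // kPermdARGBToY_AVX
+  int i;
+  for (i = 0; i < 8; ++i) {
+    out[i] = in[kPerm[i]];
+  }
+}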
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOYJROW_AVX2 + +#ifdef HAS_ABGRTOYJROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOYJROW_AVX2 + +#ifdef HAS_RGBATOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_RGBATOYJROW_AVX2 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ARGBTOUVROW_SSSE3 + +#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ + defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) +// vpshufb for vphaddw + vpackuswb packed to shorts. 
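+// For reference, a scalar sketch of the chroma math the UV rows below
+// implement: two rows of 2x2 ARGB pixels are averaged, then weighted sums
+// produce U and V. The coefficients shown are the BT.601 values libyuv
+// scales to 8 bits (compare kARGBToU/kARGBToV); the helpers are
+// illustrative only.
+static uint8_t RGBToU_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
+}
+static uint8_t RGBToV_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
+}
+// kShufARGBToUV_AVX below is the vpshufb constant that reorders the packed
+// shorts in the AVX2 paths: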
+static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +#endif + +#if defined(HAS_ARGBTOUVROW_AVX2) +void ARGBToUVRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ABGRTOUVROW_AVX2 +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // 
%4 + "m"(kAddUV128), // %5 + "m"(kABGRToV), // %6 + "m"(kABGRToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_AVX2 +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kSub128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOUVJROW_AVX2 + +// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix +#ifdef HAS_ABGRTOUVJROW_AVX2 +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 
0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kSub128), // %5 + "m"(kABGRToVJ), // %6 + "m"(kABGRToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_SSSE3 +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kSub128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ARGBTOUVJROW_SSSE3 + +#ifdef HAS_ABGRTOUVJROW_SSSE3 +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : 
"+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToVJ), // %5 + "m"(kABGRToUJ), // %6 + "m"(kSub128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); +} +#endif // HAS_ARGBTOUV444ROW_SSSE3 + +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra), // 
%0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps 
$0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) + +// Read 8 UV from 444 +#define READYUV444 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 \ + "movd (%[u_buf]),%%xmm3 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422 10 bit, upsample to 8 UV +#define READYUV210 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +#define READYUVA210 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 444 10 bit +#define READYUV410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 444 10 bit. With 8 Alpha. 
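+// The 10 bit readers above scale Y to 16 bits by bit replication: shift
+// left 6, then add the top 4 bits back in (psllw $6 / psrlw $4 / paddw), so
+// 0x3ff maps to 0xffff exactly. A scalar sketch, illustrative only:
+static uint16_t Scale10To16_Sketch(uint16_t y10) {
+  return (uint16_t)((y10 << 6) + (y10 >> 4));
+}
+// READYUVA410 below does the same and additionally loads 10 bit alpha,
+// shifting and packing it to 8 bits: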
+#define READYUVA410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 4 UV from 422 12 bit, upsample to 8 UV +#define READYUV212 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $0x4,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $4,%%xmm4 \n" \ + "psrlw $8,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm3 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 444. With 8 Alpha. +#define READYUVA444 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" + +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 \ + "movq (%[uv_buf]),%%xmm3 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 VU from NV21, upsample to 8 UV +#define READNV21 \ + "movq (%[vu_buf]),%%xmm3 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm3 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" + +// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
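+// Packed 4:2:2 byte order handled by these readers: YUY2 stores {Y0, U, Y1,
+// V} per pixel pair and UYVY stores {U, Y0, V, Y1}; the kShuffle* tables
+// split out 8 Y bytes and duplicate each UV pair. A scalar sketch for one
+// YUY2 pair (illustrative only):
+static void YUY2PairToYUV_Sketch(const uint8_t yuy2[4],
+                                 uint8_t y[2],
+                                 uint8_t* u,
+                                 uint8_t* v) {
+  y[0] = yuy2[0];
+  *u = yuy2[1];
+  y[1] = yuy2[2];
+  *v = yuy2[3];
+}
+// READUYVY below is the same pattern with the U/Y byte order swapped: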
+#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" + +// Read 4 UV from P210, upsample to 8 UV +#define READP210 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "psrlw $0x8,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +#if defined(__x86_64__) +#define YUVTORGB_SETUP(yuvconstants) \ + "pcmpeqb %%xmm13,%%xmm13 \n" \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "pxor %%xmm12,%%xmm12 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "psllw $7,%%xmm13 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "pshufb %%xmm12,%%xmm13 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" + +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB16(yuvconstants) \ + "psubb %%xmm13,%%xmm3 \n" \ + "pmulhuw %%xmm11,%%xmm4 \n" \ + "movdqa %%xmm8,%%xmm0 \n" \ + "movdqa %%xmm9,%%xmm1 \n" \ + "movdqa %%xmm10,%%xmm2 \n" \ + "paddw %%xmm12,%%xmm4 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + +#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", + +#else +#define YUVTORGB_SETUP(yuvconstants) +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB16(yuvconstants) \ + "pcmpeqb %%xmm0,%%xmm0 \n" \ + "pxor %%xmm1,%%xmm1 \n" \ + "psllw $7,%%xmm0 \n" \ + "pshufb %%xmm1,%%xmm0 \n" \ + "psubb %%xmm0,%%xmm3 \n" \ + "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \ + "movdqa (%[yuvconstants]),%%xmm0 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm1 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm3 \n" \ + "paddw %%xmm3,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + +#define YUVTORGB_REGS +#endif + +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Store 8 ARGB values. +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" + +// Store 8 RGBA values. +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + +// Store 8 RGB24 values. 
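+// For reference, a scalar sketch of what the YUVTORGB macros above compute,
+// in the same 6 bit fixed point (note the psraw $0x6). The coefficients
+// here are the usual BT.601 limited range values scaled by 64 and are
+// approximations for illustration; the real multipliers come from the
+// yuvconstants tables.
+static uint8_t ClampToU8_Sketch(int v) {
+  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}
+static void YuvPixel_Sketch(uint8_t y, uint8_t u, uint8_t v,
+                            uint8_t* b, uint8_t* g, uint8_t* r) {
+  int y1 = ((int)y - 16) * 75;  // 1.164 * 64
+  *b = ClampToU8_Sketch((y1 + 129 * ((int)u - 128)) >> 6);  // 2.018 * 64
+  *g = ClampToU8_Sketch((y1 - 25 * ((int)u - 128) - 52 * ((int)v - 128)) >> 6);
+  *r = ClampToU8_Sketch((y1 + 102 * ((int)v - 128)) >> 6);  // 1.596 * 64
+}
+// The STORE* macros then interleave the converted channels for each output
+// format; STORERGB24 follows: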
+#define STORERGB24 \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "pshufb %%xmm5,%%xmm0 \n" \ + "pshufb %%xmm6,%%xmm1 \n" \ + "palignr $0xc,%%xmm0,%%xmm1 \n" \ + "movq %%xmm0,(%[dst_rgb24]) \n" \ + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \ + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + +// Store 8 AR30 values. +#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#ifdef HAS_I444ALPHATOARGBROW_SSSE3 +void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA444 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I444ALPHATOARGBROW_SSSE3 + +void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + STORERGB24 + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) 
// %[width] +#endif + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(yuvconstants) + STORERGB24 + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + 
[width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +// 12 bit YUV to ARGB +void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 12 bit YUV to AR30 +void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm4", "xmm5" + ); +} + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA210 + YUVTORGB(yuvconstants) STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA410 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); + // clang-format on +} +#endif + +// 10 bit YUV to AR30 +void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] 
+#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I422ALPHATOARGBROW_SSSE3 + +void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READNV12 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READNV21 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [vu_buf]"+r"(vu_buf), // %[vu_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21]"m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUY2 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READUYVY + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP210 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + 
: [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP410 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + STORERGBA + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq 
$0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm3 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +#define READYUV422_AVX512BW \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \ + "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \ + "vpermq $0xd8,%%zmm3,%%zmm3 \n" \ + "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \ + "vmovdqu8 (%[y_buf]),%%ymm4 \n" \ + "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \ + "vpermq $0xd8,%%zmm4,%%zmm4 \n" \ + "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. 
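+// Note on the 210 readers: samples sit in the low 10 bits of each 16-bit
+// word. UV is reduced to 8 bits with an arithmetic shift (vpsraw $2), while
+// Y is widened to the full 16-bit range by bit replication. A scalar sketch
+// of that widening (illustrative only, not a function in this file):
+//   uint16_t y16 = (uint16_t)((y10 << 6) + (y10 >> 4));  // 1023 -> 65535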
+#define READYUVA210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + +// Read 16 UV from 410 +#define READYUV410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 212 12 bit, upsample to 16 UV +#define READYUV212_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $0x4,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $4,%%ymm4,%%ymm2 \n" \ + "vpsrlw $8,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 16 UV from 410. With 16 Alpha. +#define READYUVA410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + +// Read 16 UV from 444. With 16 Alpha. +#define READYUVA444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
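+// As with READYUV422_AVX2 above, the 4:2:2 readers upsample by duplication,
+// not interpolation: each UV pair is repeated for the two Y samples it
+// covers, roughly u[x] = src_u[x / 2], v[x] = src_v[x / 2] per output pixel.
+// The vpunpcklbw/vpunpcklwd sequence below does this for all 16 pixels at once.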
+#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm3 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from NV12, upsample to 16 UV. +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm3 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 VU from NV21, upsample to 16 UV. +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm3 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from P210, upsample to 8 UV +#define READP210_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm3 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm3 \n" \ + "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \ + "lea 0x40(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" + +// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 
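+// YUY2 and UYVY differ only in byte order (YUY2: Y0 U0 Y1 V0 ...,
+// UYVY: U0 Y0 V0 Y1 ...), so the same two-shuffle approach used for YUY2
+// above works here with different vpshufb tables.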
+#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" + +// TODO(fbarchard): Remove broadcastb +#if defined(__x86_64__) +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vpsllw $7,%%xmm13,%%xmm13 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vpbroadcastb %%xmm13,%%ymm13 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" + +#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \ + "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "vpbroadcastq %%xmm8, %%zmm8 \n" \ + "vpsllw $7,%%xmm13,%%xmm13 \n" \ + "vpbroadcastb %%xmm13,%%zmm13 \n" \ + "movq 32(%[yuvconstants]),%%xmm9 \n" \ + "vpbroadcastq %%xmm9,%%zmm9 \n" \ + "movq 64(%[yuvconstants]),%%xmm10 \n" \ + "vpbroadcastq %%xmm10,%%zmm10 \n" \ + "movq 96(%[yuvconstants]),%%xmm11 \n" \ + "vpbroadcastq %%xmm11,%%zmm11 \n" \ + "movq 128(%[yuvconstants]),%%xmm12 \n" \ + "vpbroadcastq %%xmm12,%%zmm12 \n" \ + "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \ + "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \ + "vmovdqu8 (%[unperm]),%%zmm18 \n" + +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ + "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \ + "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \ + "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + +#define YUVTORGB16_AVX512BW(yuvconstants) \ + "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \ + "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \ + "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \ + "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \ + "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \ + "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \ + "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \ + "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \ + "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n" + +#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", +#define YUVTORGB_REGS_AVX512BW \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18", + +#else // Convert 16 pixels: 16 UV and 16 Y. 
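+// 32-bit builds: i386 exposes only xmm0-xmm7/ymm0-ymm7, so there are no
+// spare registers to stage the constants in. The setup macro below is left
+// empty and YUVTORGB16_AVX2 reloads each coefficient from memory on every
+// loop iteration instead.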
+ +#define YUVTORGB_SETUP_AVX2(yuvconstants) +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \ + "vpsllw $7,%%xmm0,%%xmm0 \n" \ + "vpbroadcastb %%xmm0,%%ymm0 \n" \ + "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \ + "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm0 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \ + "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + +#define YUVTORGB_REGS_AVX2 +#endif + +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + +#define YUVTORGB_AVX512BW(yuvconstants) \ + YUVTORGB16_AVX512BW(yuvconstants) \ + "vpsraw $0x6,%%zmm0,%%zmm0 \n" \ + "vpsraw $0x6,%%zmm1,%%zmm1 \n" \ + "vpsraw $0x6,%%zmm2,%%zmm2 \n" \ + "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \ + "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \ + "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n" + +// Store 16 ARGB values. +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 32 ARGB values. +#define STOREARGB_AVX512BW \ + "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \ + "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \ + "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \ + "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \ + "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \ + "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \ + "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \ + "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \ + "lea 0x80(%[dst_argb]), %[dst_argb] \n" + +// Store 16 AR30 values. +#define STOREAR30_AVX2 \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
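+// A scalar sketch of the fixed-point math YUVTORGB_AVX2 performs per pixel
+// (illustrative only; Clamp8 and the *_term helpers are placeholders for the
+// packed vpmaddubsw coefficients in struct YuvConstants):
+//   int y16 = ((y * 0x0101 * yg) >> 16) + ygb;    // vpmulhuw + vpaddw
+//   int b = Clamp8((y16 + ub_term(u)) >> 6);      // vpaddsw, vpsraw $6
+//   int g = Clamp8((y16 - uvg_term(u, v)) >> 6);  // vpsubsw, vpsraw $6
+//   int r = Clamp8((y16 + vr_term(v)) >> 6);      // vpaddsw, vpsraw $6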
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV444_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I444TOARGBROW_AVX2 + +#if defined(HAS_I422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX2 + +#if defined(HAS_I422TOARGBROW_AVX512BW) +static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2}; +static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4}; +static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + +// 32 pixels +// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128 +// bytes). +void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX512BW(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n" + "vpbroadcastq %%xmm5,%%zmm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX512BW + YUVTORGB_AVX512BW(yuvconstants) + STOREARGB_AVX512BW + "sub $0x20,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm] + [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm] + [unperm]"r"(kUnpermuteAVX512) // %[unperm] + : "memory", "cc", YUVTORGB_REGS_AVX512BW + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX512BW + +#if defined(HAS_I422TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
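+// AR30 is little-endian 2:10:10:10 ARGB: bits 0-9 blue, 10-19 green, 20-29
+// red, 30-31 alpha. Scalar form of one output pixel, with each 10-bit
+// channel clamped to [0, 1023] and alpha forced opaque:
+//   uint32_t ar30 = 0xC0000000u | (r10 << 20) | (g10 << 10) | b10;
+// The ymm5/ymm6/ymm7 constants set up below provide the alpha bits, the min
+// of 0 and the max of 1023 for STOREAR30_AVX2.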
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + +#if defined(HAS_I210TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I212TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I212TOARGBROW_AVX2 + +#if defined(HAS_I210TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
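+// The *ToAR30 rows use YUVTORGB16_AVX2 rather than YUVTORGB_AVX2: the RGB
+// intermediates stay 16-bit (6 fractional bits) and STOREAR30_AVX2 shifts
+// them down by only 4, producing 10-bit channels without an 8-bit pack.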
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I210TOAR30ROW_AVX2 + +#if defined(HAS_I212TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I212TOAR30ROW_AVX2 + +#if defined(HAS_I410TOARGBROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I210ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). 
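+// The Alpha variants bind six pointers plus a counter, which exhausts the
+// general registers on 32-bit x86; width therefore falls back to a memory
+// operand ("+m") under __i386__ and is decremented with an explicit 32-bit
+// "subl" so the operand size is unambiguous.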
+void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA210_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I410ALPHATOARGBROW_AVX2) +// 16 pixels +// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). +void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA410_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I410TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I410TOAR30ROW_AVX2 + +#if defined(HAS_I444ALPHATOARGBROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y and 16 A producing 16 ARGB. 
+void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA444_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I444ALPHATOARGBROW_AVX2 + +#if defined(HAS_I422ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#if defined(HAS_I422TORGBAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(yuvconstants) + + // Step 3: Weave into RGBA + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TORGBAROW_AVX2 + +#if defined(HAS_NV12TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
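+// NV12 interleaves chroma as U0 V0 U1 V1 ...; NV21 stores V0 U0 V1 U1 ....
+// READNV21_AVX2 swaps each pair back to UV order while duplicating it (via
+// kShuffleNV21), so both formats share the rest of the pipeline.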
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV12_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV21_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUY2_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READUYVY_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#if defined(HAS_P210TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P210TOARGBROW_AVX2
+
+#if defined(HAS_P410TOARGBROW_AVX2)
+// 16 pixels.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P410TOARGBROW_AVX2
+
+#if defined(HAS_P210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
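+// P210/P410 carry 10-bit samples MSB-aligned in 16-bit words (as in P010),
+// so the P-format readers take the high byte of each UV word with vpsrlw $8
+// and feed the 16-bit Y words to vpmulhuw directly; contrast I210/I410,
+// which are LSB-aligned and need explicit scaling shifts.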
+void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P210TOAR30ROW_AVX2
+
+#if defined(HAS_P410TOAR30ROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P410TOAR30ROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
+ "movdqa 128(%3),%%xmm3 \n" // ygb = -1160 = 1.164 * 16
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
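+// "mutates" refers to AVX2 unpack/pack working within each 128-bit lane:
+// vpunpcklbw reorders elements relative to a linear 256-bit view and a
+// matching vpackuswb restores them. Where that pairing is broken, vpermq
+// $0xd8 is used to re-linearize the lanes.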
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 + "vpslld $0x18,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsraw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : "r"(yuvconstants) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_I400TOARGBROW_AVX2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the UV. 
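+// Reverses the order of the 16-bit UV pairs while keeping U before V within
+// each pair: bytes u0 v0 u1 v1 ... u7 v7 become u7 v7 ... u1 v1 u0 v0.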
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_MIRRORUVROW_AVX2 +void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORUVROW_AVX2 + +#ifdef HAS_MIRRORSPLITUVROW_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorSplitUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorSplitUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_MIRRORSPLITUVROW_SSSE3 + +#ifdef HAS_RGB24MIRRORROW_SSSE3 + +// Shuffle first 5 pixels to last 5 mirrored. first byte zero +static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, + 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; + +// Shuffle last 5 pixels to first 5 mirrored. 
last byte zero +static const uvec8 kShuffleMirrorRGB1 = { + 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; + +// Shuffle 5 pixels at a time (15 bytes) +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + intptr_t temp_width = (intptr_t)(width); + src_rgb24 += width * 3 - 48; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // first 5 + "movdqu 15(%0),%%xmm1 \n" // next 5 + "movdqu 30(%0),%%xmm2 \n" // next 5 + "movdqu 32(%0),%%xmm3 \n" // last 1 special + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm4,%%xmm2 \n" + "pshufb %%xmm5,%%xmm3 \n" + "lea -0x30(%0),%0 \n" + "movdqu %%xmm0,32(%1) \n" // last 5 + "movdqu %%xmm1,17(%1) \n" // next 5 + "movdqu %%xmm2,2(%1) \n" // next 5 + "movlpd %%xmm3,0(%1) \n" // first 1 + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorRGB0), // %3 + "m"(kShuffleMirrorRGB1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_RGB24MIRRORROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 + +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. 
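+// (Strictly, the table below feeds vpermd, so it reverses whole 32-bit ARGB
+// pixels rather than individual bytes.)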
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_AVX2 +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_DETILEROW_SSE2 +void DetileRow_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "sub $0x10,%2 \n" + "lea (%0,%3),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILEROW_16_SSE2 +void DetileRow_16_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILEROW_16_AVX +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea (%0,%3,2),%0 \n" + "vmovdqu %%ymm0,(%1) \n" 
+ "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_AVX + +#ifdef HAS_DETILETOYUY2_SSE2 +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" // Load 16 Y + "sub $0x10,%3 \n" + "lea (%0,%4),%0 \n" + "movdqu (%1),%%xmm1 \n" // Load 8 UV + "lea (%1,%5),%1 \n" + "movdqu %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list + ); +} +#endif + +#ifdef HAS_DETILESPLITUVROW_SSSE3 +// TODO(greenjustin): Look into generating these constants instead of loading +// them since this can cause branch mispredicts for fPIC code on 32-bit +// machines. +static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14, + 1, 3, 5, 7, 9, 11, 13, 15}; + +// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very +// slow on older SSE2 processors. +void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqu %4,%%xmm1 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea (%0, %5),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "movhps %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "m"(kDeinterlaceUV), // %4 + "r"(src_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILESPLITUVROW_SSSE3 + +#ifdef HAS_MERGEUVROW_AVX512BW +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile("sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%0),%%zmm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" + "lea 0x20(%0),%0 \n" + "vpsllw $0x8,%%zmm1,%%zmm1 \n" + "vporq %%zmm0,%%zmm1,%%zmm2 \n" + "vmovdqu64 %%zmm2,(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX512BW + +#ifdef HAS_MERGEUVROW_AVX2 +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile("sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%0),%%ymm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile("sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 
0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vmovd %5,%%xmm4 \n" + + + "sub %0,%1 \n" + // 8 pixels per loop. + + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm0 \n" + "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" + "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" + "vpslld %%xmm4,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(16 - depth), // %4 + "r"(32 - depth) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_SPLITUVROW_16_AVX2 +const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; +void SplitUVRow_16_AVX2(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + depth = 16 - depth; + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "sub %1,%2 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + + "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x0,%%ymm1,0x10(%1) \n" + "vextractf128 $0x1,%%ymm0,(%1,%2) \n" + "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" + "add $0x20,%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(depth), // %4 + "m"(kSplitUVShuffle16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + // clang-format on +} +#endif // HAS_SPLITUVROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 32 pixels per loop. 
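+      // Worked example: 10-bit data uses scale 64, so a full-scale value
+      // 0x03FF * 64 = 0xFFC0, moving the 10 significant bits to the msbs.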
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert msb formats to lsb, depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// 65536 = 16 bits +#ifdef HAS_DIVIDEROW_16_AVX2 +void DivideRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width), // %2 + "+r"(scale) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + +// Use scale to convert to lsb formats depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. 
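+      // punpcklbw duplicates each byte into both halves of a word (v -> v*257);
+      // pmulhuw then keeps the high 16 bits of v*257*scale, so e.g. scale 1024
+      // maps 255 to 1023 for 10-bit output.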
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT8TO16ROW_AVX2 +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT8TO16ROW_AVX2 + +#ifdef HAS_SPLITRGBROW_SSSE3 +// Shuffle table for converting RGB to Planar. +static const uvec8 kSplitRGBShuffle[9] = { + {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u, + 7u, 10u, 13u}, + {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, + 8u, 11u, 14u}, + {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, + 12u, 15u}}; + +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb 0(%5), %%xmm0 \n" + "pshufb 16(%5), %%xmm1 \n" + "pshufb 32(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb 48(%5),%%xmm0 \n" + "pshufb 64(%5),%%xmm1 \n" + "pshufb 80(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb 96(%5), %%xmm0 \n" + "pshufb 112(%5), %%xmm1 \n" + "pshufb 128(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(&kSplitRGBShuffle[0]) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // 
HAS_SPLITRGBROW_SSSE3 + +#ifdef HAS_MERGERGBROW_SSSE3 +// Shuffle table for converting Planar to RGB. +static const uvec8 kMergeRGBShuffle[9] = { + {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u, + 128u, 5u}, + {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, + 128u, 128u}, + {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u}, + {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u, + 10u, 128u}, + {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, + 128u, 10u}, + {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u}, + {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u, + 15u, 128u, 128u}, + {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u}, + {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u}}; + +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb (%5), %%xmm0 \n" + "pshufb 16(%5), %%xmm1 \n" + "pshufb 32(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb 48(%5), %%xmm0 \n" + "pshufb 64(%5), %%xmm1 \n" + "pshufb 80(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb 96(%5), %%xmm0 \n" + "pshufb 112(%5), %%xmm1 \n" + "pshufb 128(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "r"(&kMergeRGBShuffle[0]) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGERGBROW_SSSE3 + +#ifdef HAS_MERGEARGBROW_SSE2 +void MergeARGBRow_SSE2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + + LABELALIGN + "1: \n" + + "movq (%0,%2),%%xmm0 \n" // B + "movq (%0),%%xmm1 \n" // R + "movq (%0,%1),%%xmm2 \n" // G + "punpcklbw %%xmm1,%%xmm0 \n" // BR + "movq (%0,%3),%%xmm1 \n" // A + "punpcklbw %%xmm1,%%xmm2 \n" // GA + "movdqa %%xmm0,%%xmm1 \n" // BR + "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) + "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) + "movdqu %%xmm0,(%4) \n" + "movdqu %%xmm1,16(%4) \n" + + "lea 8(%0),%0 \n" + "lea 32(%4),%4 \n" + "sub $0x8,%5 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_MERGEXRGBROW_SSE2 +void MergeXRGBRow_SSE2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + + "movq (%2),%%xmm0 \n" // B + "movq (%0),%%xmm1 \n" // R + "movq (%1),%%xmm2 \n" // G + "punpcklbw %%xmm1,%%xmm0 \n" // BR + "pcmpeqd %%xmm1,%%xmm1 \n" // A(255) + "punpcklbw 
%%xmm1,%%xmm2 \n" // GA + "movdqa %%xmm0,%%xmm1 \n" // BR + "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) + "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,16(%3) \n" + + "lea 8(%0),%0 \n" + "lea 8(%1),%1 \n" + "lea 8(%2),%2 \n" + "lea 32(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEARGBROW_SSE2 + +#ifdef HAS_MERGEARGBROW_AVX2 +void MergeARGBRow_AVX2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + + LABELALIGN + "1: \n" + + "vmovdqu (%0,%2),%%xmm0 \n" // B + "vmovdqu (%0,%1),%%xmm1 \n" // R + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G + "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A + "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%4) \n" // First 8 + "vmovdqu %%ymm1,32(%4) \n" // Next 8 + + "lea 16(%0),%0 \n" + "lea 64(%4),%4 \n" + "sub $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_MERGEXRGBROW_AVX2 +void MergeXRGBRow_AVX2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + + "vmovdqu (%2),%%xmm0 \n" // B + "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255) + "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G + "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3) \n" // First 8 + "vmovdqu %%ymm1,32(%3) \n" // Next 8 + + "lea 16(%0),%0 \n" + "lea 16(%1),%1 \n" + "lea 16(%2),%2 \n" + "lea 64(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEARGBROW_AVX2 + +#ifdef HAS_SPLITARGBROW_SSE2 +void SplitARGBRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + + "sub %1,%2 \n" + "sub %1,%3 \n" + "sub %1,%4 \n" + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 + "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B + "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw 
%%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%1,%3) \n" // B + "movhps %%xmm0,(%1,%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + "movhps %%xmm2,(%1,%4) \n" // A + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "sub $0x8,%5 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+rm"(width) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_SPLITXRGBROW_SSE2 +void SplitXRGBRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 + "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B + "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%3) \n" // B + "movhps %%xmm0,(%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "lea 8(%2),%2 \n" + "lea 8(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; +#ifdef HAS_SPLITARGBROW_SSSE3 +void SplitARGBRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + + "movdqa %6,%%xmm3 \n" + "sub %1,%2 \n" + "sub %1,%3 \n" + "sub %1,%4 \n" + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) + "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%1,%3) \n" // B + "movhps %%xmm0,(%1,%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + "movhps %%xmm2,(%1,%4) \n" // A + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "subl $0x8,%5 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(kShuffleMaskARGBSplit) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} +#endif + +#ifdef HAS_SPLITXRGBROW_SSSE3 +void SplitXRGBRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + "movdqa %5,%%xmm3 \n" + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) + "pshufb %%xmm3,%%xmm1 \n" // 
048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%3) \n" // B + "movhps %%xmm0,(%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "lea 8(%2),%2 \n" + "lea 8(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskARGBSplit) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} +#endif + +#ifdef HAS_SPLITARGBROW_AVX2 +static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; +void SplitARGBRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + + "sub %1,%2 \n" + "sub %1,%3 \n" + "sub %1,%4 \n" + "vmovdqa %7,%%ymm3 \n" + "vbroadcastf128 %6,%%ymm4 \n" + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00-0F + "vmovdqu 16(%0),%%xmm1 \n" // 10-1F + "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F + "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermd %%ymm0,%%ymm3,%%ymm0 \n" + "vpermd %%ymm1,%%ymm3,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR + "vmovdqu %%xmm0,(%1,%3) \n" // B + "vextracti128 $1,%%ymm0,(%1) \n" // R + "vmovdqu %%xmm2,(%1,%2) \n" // G + "vextracti128 $1,%%ymm2,(%1,%4) \n" // A + "lea 64(%0),%0 \n" + "lea 16(%1),%1 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(kShuffleMaskARGBSplit), // %6 + "m"(kShuffleMaskARGBPermute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SPLITXRGBROW_AVX2 +void SplitXRGBRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + "vmovdqa %6,%%ymm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00-0F + "vmovdqu 16(%0),%%xmm1 \n" // 10-1F + "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F + "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermd %%ymm0,%%ymm3,%%ymm0 \n" + "vpermd %%ymm1,%%ymm3,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR + "vmovdqu %%xmm0,(%3) \n" // B + "vextracti128 $1,%%ymm0,(%1) \n" // R + "vmovdqu %%xmm2,(%2) \n" // G + + "lea 64(%0),%0 \n" + "lea 16(%1),%1 \n" + "lea 16(%2),%2 \n" + "lea 16(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskARGBSplit), // %5 + "m"(kShuffleMaskARGBPermute) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_MERGEXR30ROW_AVX2 +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = depth - 10; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpcmpeqb 
%%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $6,%%ymm6,%%ymm6 \n" + "vmovd %5,%%xmm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1),%%ymm1 \n" + "vmovdqu (%0,%2),%%ymm2 \n" + "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" + "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit + "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB + "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" + "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit + "vpslld $0xa,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine + "vpor %%ymm2,%%ymm3,%%ymm3 \n" + "vmovdqu %%ymm0,(%3) \n" + "vmovdqu %%ymm3,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 +#if defined(__i386__) + : "m"(shift) // %5 +#else + : "rm"(shift) // %5 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_MERGEAR64ROW_AVX2 +static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vmovdqa %8,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + "vbroadcastss %7,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpermd %%ymm3,%%ymm5,%%ymm3 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%4) \n" + "vmovdqu %%ymm2,0x20(%4) \n" + "vmovdqu %%ymm4,0x40(%4) \n" + "vmovdqu %%ymm1,0x60(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(mask), // %7 + "m"(MergeAR64Permute) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, 
+ int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vmovdqa %7,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vbroadcastss %6,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%3) \n" + "vmovdqu %%ymm2,0x20(%3) \n" + "vmovdqu %%ymm4,0x40(%3) \n" + "vmovdqu %%ymm1,0x60(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(mask), // %6 + "m"(MergeAR64Permute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vbroadcastf128 %7,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%4) \n" + "vmovdqu %%ymm0,0x20(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(MergeARGB16To8Shuffle) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vbroadcastf128 
%6,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%3) \n" + "vmovdqu %%ymm0,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(MergeARGB16To8Shuffle) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_AVX + +#ifdef HAS_COPYROW_ERMS +// Multiple of 1. 
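+// On CPUs with Enhanced REP MOVSB (ERMS), a single "rep movsb" copies width
+// bytes quickly with no vector registers and no alignment requirements.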
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); +} +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +// width in pixels +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +static const uvec8 kShuffleAlphaShort_AVX2 = { + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; + +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
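+      // The pack instructions above interleave the two 128-bit lanes, so
+      // vpermd with kPermdARGBToY_AVX restores linear order before the store.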
+ "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { + size_t width_tmp = (size_t)(width >> 2); + const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
+ asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); +} + +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); 
+} + +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_YUY2TOYROW_AVX2 +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : 
"memory", "cc", "xmm0", "xmm1"); +} + +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + 
"vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; + +// Blend 8 pixels at a time +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_SSSE3 +// Blend 8 pixels at a time. 
+// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); +} +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 32 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_BLENDPLANEROW_AVX2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +// Attenuate 4 pixels at a time. +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. 
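+      // Alpha is broadcast to words as a*257 and each channel expanded to
+      // v*257, so pmulhuw plus the later psrlw $0x8 is approximately v*a/255.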
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + uintptr_t alpha; + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; +// Unattenuate 8 pixels at a time. 
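+// fixed_invtbl8 maps an alpha byte to a fixed-point reciprocal; the scalar
+// loads below (marked "replace VPGATHER") gather eight entries without
+// relying on vpgather, which is slow on some cores.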
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile(
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 8 pixel loop.
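+      // pmaddubsw multiplies unsigned bytes by signed bytes, so pixels are
+      // biased to signed range first (psubb with kSub128); the later paddw
+      // of 0x8080 words restores 128*sum(coeffs) and rounds the >>8.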
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
+
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
+
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
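+// Each output channel is a signed dot product of the four input channels
+// with one pshufd-broadcast column of the 4x4 matrix (pmaddubsw/phaddsw),
+// scaled down by >>6.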
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. 
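+      // Duplicated bytes make v*257 and x*257 words, so pmulhuw plus the
+      // >>8 shift approximates x * v / 255 for each channel.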
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
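+      // vpaddusb saturates each byte at 255, so no widening is required.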
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
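+      // Sums (src_y0 - src_y1) over three columns with weights 1,2,1, then
+      // takes the absolute value (psubw/pmaxsw) and saturates to bytes.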
+ LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + // 8 pixel loop. 
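+      // punpck chains interleave SobelY, the saturated X+Y sum, SobelX and
+      // 0xff alpha bytes into B,G,R,A order.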
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value, inclusive of the value. +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop. + LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count) { + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop. 
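+      // Box sum from the integral image: topleft - topright - botleft +
+      // botright; for areas <= 0x80 a 16-bit reciprocal and pmulhuw replace
+      // the float divide used by the large-area loop at 40:.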
+ LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. 
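+// src_dudv holds the start (x,y) in the first two floats and the per-pixel
+// (du,dv) step in the next two; pmaddwd with (4,stride) words turns each
+// (x,y) pair into a byte offset x*4 + y*stride.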
+LIBYUV_API +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width) { + intptr_t src_argb_stride_temp = src_argb_stride; + intptr_t temp; + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. 
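+      // pavgb computes (a + b + 1) >> 1, the exact 50/50 blend.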
+ LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 100b \n" + + "99: \n" + "vzeroupper \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
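+// The shuffler is an arbitrary 16-byte pshufb mask, broadcast to both
+// lanes, so one routine handles any byte-order swap of 4-byte pixels.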
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_SSE2 + +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_AVX2 +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_AVX2 + +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 
\n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_AVX2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_HALFFLOATROW_SSE2 +static float kScaleBias = 1.9259299444e-34f; +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. 
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // 8 shorts
+ "add $0x10,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/3
+ "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
+ "punpckhwd %%xmm5,%%xmm3 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "psrld $0xd,%%xmm2 \n"
+ "psrld $0xd,%%xmm3 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,-0x10(%0,%1,1) \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(scale) // %3
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ scale *= kScaleBias;
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm2", "xmm3");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile(
+ // 1 pixel loop.
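+      // Each channel indexes its own lane of the interleaved table:
+      // B = table[b*4+0], G = table[g*4+1], R = table[r*4+2], A = table[a*4+3].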
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
+ uintptr_t pixel_temp;
+ uintptr_t table_temp;
+ asm volatile(
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "=&d"(pixel_temp), // %0
+ "=&a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory",
"cc", "xmm0", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +static const uvec8 kYUV24Shuffle[3] = { + {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12}, + {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15}, + {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}}; + +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. +// YUV24 is VUY in memory +void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "sub %0,%1 \n" + "movdqa (%4),%%xmm4 \n" // 3 shuffler constants + "movdqa 16(%4),%%xmm5 \n" + "movdqa 32(%4),%%xmm6 \n" + "1: \n" + "movdqu (%0),%%xmm2 \n" // load 16 Y values + "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values + "lea 16(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3 + "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5 + "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7 + "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24 + "pshufb %%xmm5, %%xmm1 \n" + "pshufb %%xmm6, %%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm1,16(%2) \n" + "movdqu %%xmm2,32(%2) \n" + "lea 48(%2),%2 \n" + "sub $16,%3 \n" // 16 pixels per loop + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. +// YUV24 is VUY in memory +void NV21ToYUV24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants + "vbroadcastf128 16(%4),%%ymm5 \n" + "vbroadcastf128 32(%4),%%ymm6 \n" + + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // load 32 Y values + "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values + "lea 32(%0),%0 \n" + "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3 + "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5 + "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7 + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n" + "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm3,(%2) \n" + "vmovdqu %%ymm0,32(%2) \n" + "vmovdqu %%ymm1,64(%2) \n" + "lea 96(%2),%2 \n" + "sub $32,%3 \n" // 32 pixels per loop + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_NV21ToYUV24ROW_AVX512 +// The following VMBI VEX256 code tests okay with the intelsde emulator. 
+static const lvec8 kYUV24Perm[3] = { + {32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36, + 37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43}, + {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15, + 48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52}, + {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59, + 26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}}; + +void NV21ToYUV24Row_AVX512(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "sub %0,%1 \n" + "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants + "vmovdqa 32(%4),%%ymm5 \n" + "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // load 32 Y values + "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values + "lea 32(%0),%0 \n" + "vmovdqa %%ymm2, %%ymm0 \n" + "vmovdqa %%ymm2, %%ymm1 \n" + "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n" + "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n" + "vmovdqu %%ymm0,(%2) \n" + "vmovdqu %%ymm1,32(%2) \n" + "vmovdqu %%ymm2,64(%2) \n" + "lea 96(%2),%2 \n" + "sub $32,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Perm[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif // HAS_NV21ToYUV24ROW_AVX512 + +#ifdef HAS_SWAPUVROW_SSSE3 + +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; + +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + + "movdqu %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_SSSE3 + +#ifdef HAS_SWAPUVROW_AVX2 +void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_AVX2 + +void HalfMergeUVRow_SSSE3(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // load 16 U values + "movdqu (%1),%%xmm1 \n" // load 16 V values + "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row + "movdqu 0(%1,%5,1),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // half size + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x10(%1),%1 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw 
$0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+ asm volatile(
+ "pxor %%xmm1,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+ "add $0x4, %0 \n"
+ "movd %%xmm0, (%1) \n" // store float
+ "add $0x4, %1 \n"
+ "sub $0x4,%2 \n" // 1 float per loop
+ "jg 1b \n"
+ : "+r"(src_x), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/row_lasx.cc b/source/row_lasx.cc
new file mode 100644
index 00000000..1082ad80
--- /dev/null
+++ b/source/row_lasx.cc
@@ -0,0 +1,2304 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+  {                                                      \
+    ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]);      \
+    vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]);      \
+    ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]);      \
+    vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]);      \
+    yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]);     \
+    yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
+  }
+
+// Load 32 YUV422 pixel data
+#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
+  {                                                             \
+    __m256i temp0, temp1;                                       \
+                                                                \
+    DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
+    temp1 = __lasx_xvld(psrc_v, 0);                             \
+    temp0 = __lasx_xvsub_b(temp0, const_0x80);                  \
+    temp1 = __lasx_xvsub_b(temp1, const_0x80);                  \
+    temp0 = __lasx_vext2xv_h_b(temp0);                          \
+    temp1 = __lasx_vext2xv_h_b(temp1);                          \
+    uv_l = __lasx_xvilvl_h(temp0, temp1);                       \
+    uv_h = __lasx_xvilvh_h(temp0, temp1);                       \
+  }
+
+// Load 16 YUV422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
+  {                                                   \
+    __m256i temp0, temp1;                             \
+                                                      \
+    out_y = __lasx_xvld(psrc_y, 0);                   \
+    temp0 = __lasx_xvldrepl_d(psrc_u, 0);             \
+    temp1 = __lasx_xvldrepl_d(psrc_v, 0);             \
+    uv = __lasx_xvilvl_b(temp0, temp1);               \
+    uv = __lasx_xvsub_b(uv, const_0x80);              \
+    uv = __lasx_vext2xv_h_b(uv);                      \
+  }
+
+// Convert 32 pixels of YUV to RGB.
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
+                   g_h, r_l, r_h)                                           \
+  {                                                                         \
+    __m256i u_l, u_h, v_l, v_h;                                             \
+    __m256i yl_ev, yl_od, yh_ev, yh_od;                                     \
+    __m256i temp0, temp1, temp2, temp3;                                     \
+                                                                            \
+    temp0 = __lasx_xvilvl_b(in_y, in_y);                                    \
+    temp1 = __lasx_xvilvh_b(in_y, in_y);                                    \
+    yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg);                              \
+    yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg);                              \
+    yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg);                              \
+    yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg);                              \
+    DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16,  \
+              yl_ev, yl_od, yh_ev, yh_od);                                  \
+    yl_ev = __lasx_xvadd_w(yl_ev, yb);                                      \
+    yl_od = __lasx_xvadd_w(yl_od, yb);                                      \
+    yh_ev = __lasx_xvadd_w(yh_ev, yb);                                      \
+    yh_od = __lasx_xvadd_w(yh_od, yb);                                      \
+    v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr);                                \
+    u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr);                                \
+    v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr);                                \
+    u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr);                                \
+    temp0 = __lasx_xvadd_w(yl_ev, u_l);                                     \
+    temp1 = __lasx_xvadd_w(yl_od, u_l);                                     \
+    temp2 = __lasx_xvadd_w(yh_ev, u_h);                                     \
+    temp3 = __lasx_xvadd_w(yh_od, u_h);                                     \
+    DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+              temp1, temp2, temp3);                                         \
+    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+              temp2, temp3);                                                \
+    b_l = __lasx_xvpackev_h(temp1, temp0);                                  \
+    b_h = __lasx_xvpackev_h(temp3, temp2);                                  \
+    temp0 = __lasx_xvadd_w(yl_ev, v_l);                                     \
+    temp1 = __lasx_xvadd_w(yl_od, v_l);                                     \
+    temp2 = __lasx_xvadd_w(yh_ev, v_h);                                     \
+    temp3 = __lasx_xvadd_w(yh_od, v_h);                                     \
+    DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+              temp1, temp2, temp3);                                         \
+    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+              temp2, temp3);                                                \
+    r_l = __lasx_xvpackev_h(temp1, temp0);                                  \
+    r_h = __lasx_xvpackev_h(temp3, temp2);                                  \
+    DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h);      \
+    temp0 = __lasx_xvsub_w(yl_ev, u_l);                                     \
+    temp1 = __lasx_xvsub_w(yl_od, u_l);                                     \
+    temp2 = __lasx_xvsub_w(yh_ev, u_h);                                     \
+    temp3 = __lasx_xvsub_w(yh_od, u_h);                                     \
+    DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+              temp1, temp2, temp3);                                         \
+    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+              temp2, temp3);                                                \
+    g_l = __lasx_xvpackev_h(temp1, temp0);                                  \
+    g_h = __lasx_xvpackev_h(temp3, temp2);                                  \
+  }
+
+// Convert 16 pixels of YUV to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+  {                                                                    \
+    __m256i u_l, v_l, yl_ev, yl_od;                                    \
+    __m256i temp0, temp1;                                              \
+                                                                       \
+    in_y = __lasx_xvpermi_d(in_y, 0xD8);                               \
+    temp0 = __lasx_xvilvl_b(in_y, in_y);                               \
+    yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg);                         \
+    yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg);                         \
+    DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od);    \
+    yl_ev = __lasx_xvadd_w(yl_ev, yb);                                 \
+    yl_od = __lasx_xvadd_w(yl_od, yb);                                 \
+    v_l = __lasx_xvmulwev_w_h(in_uv, ubvr);                            \
+    u_l = __lasx_xvmulwod_w_h(in_uv, ubvr);                            \
+    temp0 = __lasx_xvadd_w(yl_ev, u_l);                                \
+    temp1 = __lasx_xvadd_w(yl_od, u_l);                                \
+    DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1);      \
+    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);         \
+    out_b = __lasx_xvpackev_h(temp1, temp0);                           \
+    temp0 = __lasx_xvadd_w(yl_ev, v_l);                                \
+    temp1 = __lasx_xvadd_w(yl_od, v_l);                                \
+    DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1);      \
+    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);         \
+    out_r = __lasx_xvpackev_h(temp1, temp0);                           \
+    u_l = __lasx_xvdp2_w_h(in_uv, ugvg);                               \
+    temp0 = __lasx_xvsub_w(yl_ev, u_l);                                \
+    temp1 = __lasx_xvsub_w(yl_od, u_l);                                \
+    DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1);      \
+    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);         \
+    out_g = __lasx_xvpackev_h(temp1, temp0);                           \
+  }
+
+// Pack and Store 32 ARGB values.
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
+  {                                                                    \
+    __m256i temp0, temp1, temp2, temp3;                                \
+                                                                       \
+    temp0 = __lasx_xvpackev_b(g_l, b_l);                               \
+    temp1 = __lasx_xvpackev_b(a_l, r_l);                               \
+    temp2 = __lasx_xvpackev_b(g_h, b_h);                               \
+    temp3 = __lasx_xvpackev_b(a_h, r_h);                               \
+    r_l = __lasx_xvilvl_h(temp1, temp0);                               \
+    r_h = __lasx_xvilvh_h(temp1, temp0);                               \
+    g_l = __lasx_xvilvl_h(temp3, temp2);                               \
+    g_h = __lasx_xvilvh_h(temp3, temp2);                               \
+    temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20);                          \
+    temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20);                          \
+    temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31);                          \
+    temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31);                          \
+    __lasx_xvst(temp0, pdst_argb, 0);                                  \
+    __lasx_xvst(temp1, pdst_argb, 32);                                 \
+    __lasx_xvst(temp2, pdst_argb, 64);                                 \
+    __lasx_xvst(temp3, pdst_argb, 96);                                 \
+    pdst_argb += 128;                                                  \
+  }
+
+// Pack and Store 16 ARGB values.
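+// The even (low) byte of each 16-bit lane is packed so the channels pair up
+// as B,G and R,A bytes; interleaving those pairs as halfwords yields the
+// B,G,R,A byte order of little-endian ARGB, and __lasx_xvpermi_q restores
+// sequential 128-bit lane order before the 32-byte stores.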
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ + { \ + __m256i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lasx_xvpackev_b(in_g, in_b); \ + temp1 = __lasx_xvpackev_b(in_a, in_r); \ + temp2 = __lasx_xvilvl_h(temp1, temp0); \ + temp3 = __lasx_xvilvh_h(temp1, temp0); \ + temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20); \ + temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31); \ + __lasx_xvst(temp0, pdst_argb, 0); \ + __lasx_xvst(temp1, pdst_argb, 32); \ + pdst_argb += 64; \ + } + +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \ + _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \ + } + +void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 64; + __m256i src0, src1; + __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607, + 0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 64; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); + DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + src0 = __lasx_xvpermi_q(src0, src0, 0x01); + src1 = __lasx_xvpermi_q(src1, src1, 0x01); + __lasx_xvst(src1, dst, 0); + __lasx_xvst(src0, dst, 32); + dst += 64; + src -= 64; + } +} + +void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 16; + __m256i src, dst; + __m256i shuffler = {0x0004000500060007, 0x0000000100020003, + 0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 16) << 1; + for (x = 0; x < len; x++) { + src = __lasx_xvld(src_uv, 0); + dst = __lasx_xvshuf_h(shuffler, src, src); + dst = __lasx_xvpermi_q(dst, dst, 0x01); + __lasx_xvst(dst, dst_uv, 0); + src_uv -= 32; + dst_uv += 32; + } +} + +void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 16; + __m256i src0, src1; + __m256i dst0, dst1; + __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504, + 0x0B0A09080F0E0D0C, 0x0302010007060504}; + src += (width * 4) - 64; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); + DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + dst1 = __lasx_xvpermi_q(src0, src0, 0x01); + dst0 = __lasx_xvpermi_q(src1, src1, 0x01); + __lasx_xvst(dst0, dst, 0); + __lasx_xvst(dst1, dst, 32); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 32; + __m256i src_u0, src_v0, src_y0, vec_uv0; + __m256i vec_yuy2_0, vec_yuy2_1; + __m256i dst_yuy2_0, dst_yuy2_1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lasx_xvld(src_y, 0); + src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); + src_v0 = 
__lasx_xvpermi_d(src_v0, 0xD8); + vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0); + dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20); + dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31); + __lasx_xvst(dst_yuy2_0, dst_yuy2, 0); + __lasx_xvst(dst_yuy2_1, dst_yuy2, 32); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 32; + __m256i src_u0, src_v0, src_y0, vec_uv0; + __m256i vec_uyvy0, vec_uyvy1; + __m256i dst_uyvy0, dst_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lasx_xvld(src_y, 0); + src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); + src_v0 = __lasx_xvpermi_d(src_v0, 0xD8); + vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); + vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0); + dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20); + dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31); + __lasx_xvst(dst_uyvy0, dst_uyvy, 0); + __lasx_xvst(dst_uyvy1, dst_uyvy, 32); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void I422ToRGBARow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void I422AlphaToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + int res = width & 31; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i zero = __lasx_xvldi(0); 
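+  // zero is interleaved with the alpha bytes (__lasx_xvilvl_b /
+  // __lasx_xvilvh_b below) to zero-extend them to the 16-bit lanes that
+  // STOREARGB_D expects.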
+ __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; + + y = __lasx_xvld(src_a, 0); + a_l = __lasx_xvilvl_b(zero, y); + a_h = __lasx_xvilvh_b(zero, y); + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 32; + src_u += 16; + src_v += 16; + src_a += 32; + } + if (res) { + __m256i y, uv, r, g, b, a; + a = __lasx_xvld(src_a, 0); + a = __lasx_vext2xv_hu_bu(a); + READYUV422(src_y, src_u, src_v, y, uv); + YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); + STOREARGB(a, r, g, b, dst_argb); + } +} + +void I422ToRGB24Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614, + 0x0504120302100100, 0x0A18090816070614}; + __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B, + 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + temp0 = __lasx_xvpackev_b(g_l, b_l); + temp1 = __lasx_xvpackev_b(g_h, b_h); + DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, + r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, + temp1); + + b_l = __lasx_xvilvl_d(temp1, temp2); + b_h = __lasx_xvilvh_d(temp3, temp1); + temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20); + temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30); + temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31); + __lasx_xvst(temp1, dst_argb, 0); + __lasx_xvst(temp2, dst_argb, 32); + __lasx_xvst(temp3, dst_argb, 64); + dst_argb += 96; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
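+// Each RGB565 pixel is assembled from the 8-bit channels as
+// (R >> 3) << 11 | (G >> 2) << 5 | (B >> 3), which is the shift/or
+// sequence applied to the 16-bit lanes below.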
+void I422ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 3); + b_h = __lasx_xvsrli_h(b_h, 3); + g_l = __lasx_xvsrli_h(g_l, 2); + g_h = __lasx_xvsrli_h(g_h, 2); + r_l = __lasx_xvsrli_h(r_l, 3); + r_h = __lasx_xvsrli_h(r_h, 3); + r_l = __lasx_xvslli_h(r_l, 11); + r_h = __lasx_xvslli_h(r_h, 11); + g_l = __lasx_xvslli_h(g_l, 5); + g_h = __lasx_xvslli_h(g_h, 5); + r_l = __lasx_xvor_v(r_l, g_l); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_rgb565, 0); + __lasx_xvst(dst_h, dst_rgb565, 32); + dst_rgb565 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. +void I422ToARGB4444Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000, 0xF000F000F000F000, + 0xF000F000F000F000}; + __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, + 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 4); + b_h = __lasx_xvsrli_h(b_h, 4); + r_l = __lasx_xvsrli_h(r_l, 4); + r_h = __lasx_xvsrli_h(r_h, 4); + g_l = __lasx_xvand_v(g_l, mask); + g_h = __lasx_xvand_v(g_h, mask); + r_l = __lasx_xvslli_h(r_l, 8); + r_h = __lasx_xvslli_h(r_h, 8); + r_l = __lasx_xvor_v(r_l, alpha); + r_h = __lasx_xvor_v(r_h, alpha); + r_l = __lasx_xvor_v(r_l, g_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_argb4444, 0); + __lasx_xvst(dst_h, dst_argb4444, 32); + dst_argb4444 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void I422ToARGB1555Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, 
vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000, + 0x8000800080008000}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 3); + b_h = __lasx_xvsrli_h(b_h, 3); + g_l = __lasx_xvsrli_h(g_l, 3); + g_h = __lasx_xvsrli_h(g_h, 3); + g_l = __lasx_xvslli_h(g_l, 5); + g_h = __lasx_xvslli_h(g_h, 5); + r_l = __lasx_xvsrli_h(r_l, 3); + r_h = __lasx_xvsrli_h(r_h, 3); + r_l = __lasx_xvslli_h(r_l, 10); + r_h = __lasx_xvslli_h(r_h, 10); + r_l = __lasx_xvor_v(r_l, alpha); + r_h = __lasx_xvor_v(r_h, alpha); + r_l = __lasx_xvor_v(r_l, g_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_argb1555, 0); + __lasx_xvst(dst_h, dst_argb1555, 32); + dst_argb1555 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); + dst0 = __lasx_xvpickev_b(src1, src0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0, + src_yuy2_next, 32, src0, src1, src2, src3); + src0 = __lasx_xvpickod_b(src1, src0); + src1 = __lasx_xvpickod_b(src3, src2); + tmp0 = __lasx_xvavgr_bu(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); + tmp0 = __lasx_xvpickod_b(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1); + dst0 = 
__lasx_xvpickod_b(src1, src0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_LASX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 32; + __m256i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0, + src_uyvy_next, 32, src0, src1, src2, src3); + src0 = __lasx_xvpickev_b(src1, src0); + src1 = __lasx_xvpickev_b(src3, src2); + tmp0 = __lasx_xvavgr_bu(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToUVRow_LASX(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + const uint8_t* src_argb1 = src_argb0 + src_stride_argb; + + __m256i src0, src1, src2, src3, src4, src5, src6, src7; + __m256i vec0, vec1, vec2, vec3; + __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; + __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038, + 0x0038003800380038, 0x0038003800380038}; + __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025, + 0x0025002500250025, 0x0025002500250025}; + __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013, + 0x0013001300130013, 0x0013001300130013}; + __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f, + 0x002f002f002f002f, 0x002f002f002f002f}; + __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009, + 0x0009000900090009, 0x0009000900090009}; + __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, + 0x0000000700000003}; + __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, + src_argb0, 96, src0, src1, src2, src3); + DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64, + src_argb1, 96, src4, src5, src6, src7); + vec0 = __lasx_xvaddwev_h_bu(src0, src4); + vec1 = __lasx_xvaddwev_h_bu(src1, src5); + vec2 = __lasx_xvaddwev_h_bu(src2, src6); + vec3 = __lasx_xvaddwev_h_bu(src3, src7); + tmp0 = __lasx_xvpickev_h(vec1, vec0); + tmp1 = __lasx_xvpickev_h(vec3, vec2); + tmp2 = __lasx_xvpickod_h(vec1, vec0); + tmp3 = __lasx_xvpickod_h(vec3, vec2); + vec0 = __lasx_xvaddwod_h_bu(src0, src4); + vec1 = __lasx_xvaddwod_h_bu(src1, src5); + vec2 = __lasx_xvaddwod_h_bu(src2, src6); + vec3 = __lasx_xvaddwod_h_bu(src3, src7); + 
tmp4 = __lasx_xvpickev_h(vec1, vec0); + tmp5 = __lasx_xvpickev_h(vec3, vec2); + vec0 = __lasx_xvpickev_h(tmp1, tmp0); + vec1 = __lasx_xvpickod_h(tmp1, tmp0); + src0 = __lasx_xvavgr_h(vec0, vec1); + vec0 = __lasx_xvpickev_h(tmp3, tmp2); + vec1 = __lasx_xvpickod_h(tmp3, tmp2); + src1 = __lasx_xvavgr_h(vec0, vec1); + vec0 = __lasx_xvpickev_h(tmp5, tmp4); + vec1 = __lasx_xvpickod_h(tmp5, tmp4); + src2 = __lasx_xvavgr_h(vec0, vec1); + dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70); + dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A); + dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26); + dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70); + dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E); + dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12); + dst0 = __lasx_xvperm_w(dst0, control); + dst1 = __lasx_xvperm_w(dst1, control); + dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8); + dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_argb0 += 128; + src_argb1 += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 32) - 1; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100, + 0x000000000E0D0C0A}; + __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005, + 0x0000000700000003}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + __lasx_xvst(tmp3, dst_rgb, 72); + dst_rgb += 96; + src_argb += 128; + } + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96, + src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + dst_rgb += 72; + __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0); + __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1); + __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2); +} + +void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 32) - 1; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102, + 0x000000000C0D0E08}; + __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005, + 0x0000000700000003}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, 
shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + __lasx_xvst(tmp3, dst_rgb, 72); + dst_rgb += 96; + src_argb += 128; + } + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96, + src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + dst_rgb += 72; + __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0); + __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1); + __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2); +} + +void ARGBToRGB565Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 16; + __m256i zero = __lasx_xvldi(0); + __m256i src0, src1, tmp0, tmp1, dst0; + __m256i shift = {0x0300030003000300, 0x0300030003000300, 0x0300030003000300, + 0x0300030003000300}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp0 = __lasx_xvsrli_b(tmp0, 3); + tmp1 = __lasx_xvpackev_b(zero, tmp1); + tmp1 = __lasx_xvsrli_h(tmp1, 2); + tmp0 = __lasx_xvsll_b(tmp0, shift); + tmp1 = __lasx_xvslli_h(tmp1, 5); + dst0 = __lasx_xvor_v(tmp0, tmp1); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + dst_rgb += 32; + src_argb += 64; + } +} + +void ARGBToARGB1555Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 16; + __m256i zero = __lasx_xvldi(0); + __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; + __m256i shift1 = {0x0703070307030703, 0x0703070307030703, 0x0703070307030703, + 0x0703070307030703}; + __m256i shift2 = {0x0200020002000200, 0x0200020002000200, 0x0200020002000200, + 0x0200020002000200}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp0 = __lasx_xvsrli_b(tmp0, 3); + tmp1 = __lasx_xvsrl_b(tmp1, shift1); + tmp0 = __lasx_xvsll_b(tmp0, shift2); + tmp2 = __lasx_xvpackev_b(zero, tmp1); + tmp3 = __lasx_xvpackod_b(zero, tmp1); + tmp2 = __lasx_xvslli_h(tmp2, 5); + tmp3 = __lasx_xvslli_h(tmp3, 15); + dst0 = __lasx_xvor_v(tmp0, tmp2); + dst0 = __lasx_xvor_v(dst0, tmp3); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + dst_rgb += 32; + src_argb += 64; + } +} + +void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp1 = __lasx_xvandi_b(tmp1, 0xF0); + tmp0 = __lasx_xvsrli_b(tmp0, 4); + dst0 = __lasx_xvor_v(tmp1, tmp0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + dst_rgb += 32; + src_argb += 64; + } +} + +void ARGBToUV444Row_LASX(const uint8_t* src_argb, + 
uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int x; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, reg3, dst0, dst1; + __m256i const_112 = __lasx_xvldi(112); + __m256i const_74 = __lasx_xvldi(74); + __m256i const_38 = __lasx_xvldi(38); + __m256i const_94 = __lasx_xvldi(94); + __m256i const_18 = __lasx_xvldi(18); + __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, + 0x0000000700000003}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvpickev_h(src1, src0); + tmp1 = __lasx_xvpickod_h(src1, src0); + tmp2 = __lasx_xvpickev_h(src3, src2); + tmp3 = __lasx_xvpickod_h(src3, src2); + reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112); + reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112); + reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74); + reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74); + reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38); + reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38); + reg0 = __lasx_xvsub_h(reg0, reg2); + reg1 = __lasx_xvsub_h(reg1, reg3); + dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8); + dst0 = __lasx_xvperm_w(dst0, control); + reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112); + reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112); + reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18); + reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18); + reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94); + reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94); + reg0 = __lasx_xvsub_h(reg0, reg2); + reg1 = __lasx_xvsub_h(reg1, reg3); + dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8); + dst1 = __lasx_xvperm_w(dst1, control); + __lasx_xvst(dst0, dst_u, 0); + __lasx_xvst(dst1, dst_v, 0); + dst_u += 32; + dst_v += 32; + src_argb += 128; + } +} + +void ARGBMultiplyRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m256i zero = __lasx_xvldi(0); + __m256i src0, src1, dst0, dst1; + __m256i tmp0, tmp1, tmp2, tmp3; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); + tmp0 = __lasx_xvilvl_b(src0, src0); + tmp1 = __lasx_xvilvh_b(src0, src0); + tmp2 = __lasx_xvilvl_b(zero, src1); + tmp3 = __lasx_xvilvh_b(zero, src1); + dst0 = __lasx_xvmuh_hu(tmp0, tmp2); + dst1 = __lasx_xvmuh_hu(tmp1, tmp3); + dst0 = __lasx_xvpickev_b(dst1, dst0); + __lasx_xvst(dst0, dst_argb, 0); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAddRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lasx_xvsadd_bu(src0, src1); + __lasx_xvst(dst0, dst_argb, 0); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lasx_xvssub_bu(src0, src1); + __lasx_xvst(dst0, dst_argb, 0); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void 
ARGBAttenuateRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1; + __m256i reg0, reg1, reg2, reg3, reg4, reg5; + __m256i b, g, r, a, dst0, dst1; + __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000, + 0x0007000300060002}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + b = __lasx_xvpackev_b(tmp0, tmp0); + r = __lasx_xvpackod_b(tmp0, tmp0); + g = __lasx_xvpackev_b(tmp1, tmp1); + a = __lasx_xvpackod_b(tmp1, tmp1); + reg0 = __lasx_xvmulwev_w_hu(b, a); + reg1 = __lasx_xvmulwod_w_hu(b, a); + reg2 = __lasx_xvmulwev_w_hu(r, a); + reg3 = __lasx_xvmulwod_w_hu(r, a); + reg4 = __lasx_xvmulwev_w_hu(g, a); + reg5 = __lasx_xvmulwod_w_hu(g, a); + reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24); + reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24); + reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24); + reg0 = __lasx_xvshuf_h(control, reg0, reg0); + reg2 = __lasx_xvshuf_h(control, reg2, reg2); + reg4 = __lasx_xvshuf_h(control, reg4, reg4); + tmp0 = __lasx_xvpackev_b(reg4, reg0); + tmp1 = __lasx_xvpackev_b(a, reg2); + dst0 = __lasx_xvilvl_h(tmp1, tmp0); + dst1 = __lasx_xvilvh_h(tmp1, tmp0); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + dst_argb += 64; + src_argb += 64; + } +} + +void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1, dst0; + __m256i b, g, r; + __m256i zero = __lasx_xvldi(0); + __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0); + + vec_dither = __lasx_xvilvl_b(zero, vec_dither); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + b = __lasx_xvpackev_b(zero, tmp0); + r = __lasx_xvpackod_b(zero, tmp0); + g = __lasx_xvpackev_b(zero, tmp1); + b = __lasx_xvadd_h(b, vec_dither); + g = __lasx_xvadd_h(g, vec_dither); + r = __lasx_xvadd_h(r, vec_dither); + DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g); + r = __lasx_xvclip255_h(r); + b = __lasx_xvsrai_h(b, 3); + g = __lasx_xvsrai_h(g, 2); + r = __lasx_xvsrai_h(r, 3); + g = __lasx_xvslli_h(g, 5); + r = __lasx_xvslli_h(r, 11); + dst0 = __lasx_xvor_v(b, g); + dst0 = __lasx_xvor_v(dst0, r); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + src_argb += 64; + dst_rgb += 32; + } +} + +void ARGBShuffleRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, dst0, dst1; + __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808, 0x0404040400000000, + 0x0C0C0C0C08080808}; + __m256i temp = __lasx_xvldrepl_w(shuffler, 0); + + shuf = __lasx_xvadd_b(shuf, temp); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + dst0 = __lasx_xvshuf_b(src0, src0, shuf); + dst1 = __lasx_xvshuf_b(src1, src1, shuf); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + src_argb += 64; + dst_argb += 64; + } +} + +void ARGBShadeRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + int len = width / 8; + __m256i src0, dst0, tmp0, tmp1; + __m256i vec_value = __lasx_xvreplgr2vr_w(value); + + vec_value = __lasx_xvilvl_b(vec_value, vec_value); + for (x = 0; x < len; x++) { + src0 = 
__lasx_xvld(src_argb, 0); + tmp0 = __lasx_xvilvl_b(src0, src0); + tmp1 = __lasx_xvilvh_b(src0, src0); + tmp0 = __lasx_xvmuh_hu(tmp0, vec_value); + tmp1 = __lasx_xvmuh_hu(tmp1, vec_value); + dst0 = __lasx_xvpickod_b(tmp1, tmp0); + __lasx_xvst(dst0, dst_argb, 0); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1; + __m256i reg0, reg1, reg2, dst0, dst1; + __m256i const_128 = __lasx_xvldi(0x480); + __m256i const_150 = __lasx_xvldi(0x96); + __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, + 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + reg0 = __lasx_xvdp2_h_bu(tmp0, const_br); + reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); + reg2 = __lasx_xvadd_h(reg0, reg1); + tmp0 = __lasx_xvpackod_b(reg2, reg2); + tmp1 = __lasx_xvpackod_b(tmp1, reg2); + dst0 = __lasx_xvilvl_h(tmp1, tmp0); + dst1 = __lasx_xvilvh_h(tmp1, tmp0); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + src_argb += 64; + dst_argb += 64; + } +} + +void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1; + __m256i reg0, reg1, spb, spg, spr; + __m256i dst0, dst1; + __m256i spb_g = __lasx_xvldi(68); + __m256i spg_g = __lasx_xvldi(88); + __m256i spr_g = __lasx_xvldi(98); + __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311, + 0x2311231123112311}; + __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16, + 0x2D162D162D162D16}; + __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218, + 0x3218321832183218}; + __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100, + 0x1F0E1D0C1B0A1908}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); + spr = __lasx_xvdp2_h_bu(tmp0, spr_br); + spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g); + spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g); + spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g); + spb = __lasx_xvsrli_h(spb, 7); + spg = __lasx_xvsrli_h(spg, 7); + spr = __lasx_xvsrli_h(spr, 7); + spg = __lasx_xvsat_hu(spg, 7); + spr = __lasx_xvsat_hu(spr, 7); + reg0 = __lasx_xvpackev_b(spg, spb); + reg1 = __lasx_xvshuf_b(tmp1, spr, shuff); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + dst_argb += 64; + } +} + +void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb4444, 0); + src1 = __lasx_xvld(src_argb4444, 32); + DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2); + DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3); + DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lasx_xvilvl_b, tmp1, 
tmp0, tmp3, tmp2, reg0, reg2); + DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2, + 0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_argb4444 += 64; + dst_argb += 128; + } +} + +void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa; + __m256i reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb1555, 0); + src1 = __lasx_xvld(src_argb1555, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + tmpa = __lasx_xvsrli_b(tmp1, 7); + tmpa = __lasx_xvneg_b(tmpa); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); + DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + dst2 = __lasx_xvilvl_h(reg3, reg2); + dst3 = __lasx_xvilvh_h(reg3, reg2); + DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, + 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); + __lasx_xvst(reg0, dst_argb, 0); + __lasx_xvst(reg1, dst_argb, 32); + __lasx_xvst(reg2, dst_argb, 64); + __lasx_xvst(reg3, dst_argb, 96); + src_argb1555 += 64; + dst_argb += 128; + } +} + +void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3; + __m256i alpha = __lasx_xvldi(0xFF); + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_rgb565, 0); + src1 = __lasx_xvld(src_rgb565, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvsrli_b(tmpr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst2 = __lasx_xvilvl_h(reg1, reg0); + dst3 = __lasx_xvilvh_h(reg1, reg0); + DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, + 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); + __lasx_xvst(reg0, dst_argb, 0); + __lasx_xvst(reg1, 
dst_argb, 32); + __lasx_xvst(reg2, dst_argb, 64); + __lasx_xvst(reg3, dst_argb, 96); + src_rgb565 += 64; + dst_argb += 128; + } +} + +void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2; + __m256i dst0, dst1, dst2, dst3; + __m256i reg0, reg1, reg2, reg3; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, + 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, + 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504, + 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100, + 0x100B0A0910080706}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_rgb24, 0); + reg1 = __lasx_xvld(src_rgb24, 32); + reg2 = __lasx_xvld(src_rgb24, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, + tmp1); + tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, + 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_rgb24 += 96; + dst_argb += 128; + } +} + +void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, + 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, + 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504, + 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102, + 0x10090A0B10060708}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_raw, 0); + reg1 = __lasx_xvld(src_raw, 32); + reg2 = __lasx_xvld(src_raw, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, + tmp1); + tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, + 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_raw += 96; + dst_argb += 128; + } +} + +void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, reg2, dst0; + __m256i const_66 = __lasx_xvldi(66); + __m256i const_129 = __lasx_xvldi(129); + __m256i const_25 = __lasx_xvldi(25); + __m256i 
const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb1555, 0); + src1 = __lasx_xvld(src_argb1555, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lasx_xvpackod_b(reg1, reg0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_argb1555 += 64; + dst_y += 32; + } +} + +void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i reg0, reg1, reg2, reg3, dst0; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0, + next_argb1555, 32, src0, src1, src2, src3); + DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + nexb = __lasx_xvandi_b(tmp2, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + nexg = __lasx_xvsrli_b(tmp2, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg2 = __lasx_xvandi_b(tmp3, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + reg2 = __lasx_xvslli_b(reg2, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + nexg = __lasx_xvor_v(nexg, reg2); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + reg3 = __lasx_xvandi_b(tmp3, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + nexr = __lasx_xvsrli_b(reg3, 2); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + reg0 = __lasx_xvslli_b(nexb, 3); + reg1 = __lasx_xvslli_b(nexg, 3); + reg2 = __lasx_xvslli_b(nexr, 3); + nexb = __lasx_xvsrli_b(nexb, 2); + nexg = __lasx_xvsrli_b(nexg, 2); + nexr = __lasx_xvsrli_b(nexr, 2); + nexb = __lasx_xvor_v(reg0, nexb); + nexg = __lasx_xvor_v(reg1, nexg); + nexr = __lasx_xvor_v(reg2, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, 
reg1); + reg0 = __lasx_xvpermi_d(reg0, 0xD8); + reg1 = __lasx_xvpermi_d(reg1, 0xD8); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_argb1555 += 64; + next_argb1555 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, dst0; + __m256i const_66 = __lasx_xvldi(66); + __m256i const_129 = __lasx_xvldi(129); + __m256i const_25 = __lasx_xvldi(25); + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_rgb565, 0); + src1 = __lasx_xvld(src_rgb565, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvsrli_b(tmpr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lasx_xvpackod_b(reg1, reg0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_rgb565 += 64; + } +} + +void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i reg0, reg1, reg2, reg3, dst0; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0, + next_rgb565, 32, src0, src1, src2, src3); + DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + nexb = __lasx_xvandi_b(tmp2, 0x1F); + nexr = __lasx_xvandi_b(tmp3, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg3 = __lasx_xvandi_b(tmp3, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + reg2 = __lasx_xvsrli_b(tmp2, 5); + reg3 = __lasx_xvslli_b(reg3, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + nexg = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + reg2 = __lasx_xvslli_b(nexb, 3); + reg3 = __lasx_xvsrli_b(nexb, 2); + tmpb = 
__lasx_xvor_v(reg1, reg0); + nexb = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + reg2 = __lasx_xvslli_b(nexg, 2); + reg3 = __lasx_xvsrli_b(nexg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + nexg = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvsrli_b(tmpr, 5); + reg2 = __lasx_xvsrli_b(nexr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + nexr = __lasx_xvor_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + reg0 = __lasx_xvpermi_d(reg0, 0xD8); + reg1 = __lasx_xvpermi_d(reg1, 0xD8); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + dst_u += 16; + dst_v += 16; + src_rgb565 += 64; + next_rgb565 += 64; + } +} + +void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24; + int len = width / 32; + __m256i src0, src1, src2, reg0, reg1, reg2; + __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18, + 0x15120F0C09060300, 0x00000000001E1B18}; + __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908, + 0x0706050403020100, 0x1D1A1714110A0908}; + __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, + 0x1613100D0A070401, 0x00000000001F1C19}; + __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, + 0x0706050403020100, 0x1E1B1815120A0908}; + __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A, + 0x1714110E0B080502, 0x0000000000001D1A}; + __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908, + 0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64, + next_rgb24, 0, reg0, reg1, reg2, tmp0); + DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, + 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); + DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_rgb24 += 96; + next_rgb24 += 96; + dst_u += 16; + dst_v += 16; + } +} + +void RAWToUVRow_LASX(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + 
const uint8_t* next_raw = src_raw + src_stride_raw; + int len = width / 32; + __m256i src0, src1, src2, reg0, reg1, reg2; + __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18, + 0x15120F0C09060300, 0x00000000001E1B18}; + __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908, + 0x0706050403020100, 0x1D1A1714110A0908}; + __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, + 0x1613100D0A070401, 0x00000000001F1C19}; + __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, + 0x0706050403020100, 0x1E1B1815120A0908}; + __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A, + 0x1714110E0B080502, 0x0000000000001D1A}; + __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908, + 0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0, + reg0, reg1, reg2, tmp0); + DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, + 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); + DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_raw += 96; + next_raw += 96; + dst_u += 16; + dst_v += 16; + } +} + +void NV12ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_vrub, vec_vgug, vec_y, vec_vu; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = __lasx_xvldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); + vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_vu = __lasx_xvld(src_uv, 0); + vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); + vec_vu = __lasx_vext2xv_h_b(vec_vu); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, + out_b); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_uv += 16; + } +} + +void NV12ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 
16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_vrub, vec_vgug, vec_y, vec_vu; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); + vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_vu = __lasx_xvld(src_uv, 0); + vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); + vec_vu = __lasx_vext2xv_h_b(vec_vu); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, + out_b); + out_b = __lasx_xvsrli_h(out_b, 3); + out_g = __lasx_xvsrli_h(out_g, 2); + out_r = __lasx_xvsrli_h(out_r, 3); + out_g = __lasx_xvslli_h(out_g, 5); + out_r = __lasx_xvslli_h(out_r, 11); + out_r = __lasx_xvor_v(out_r, out_g); + out_r = __lasx_xvor_v(out_r, out_b); + __lasx_xvst(out_r, dst_rgb565, 0); + src_y += 16; + src_uv += 16; + dst_rgb565 += 32; + } +} + +void NV21ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = __lasx_xvldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_uv = __lasx_xvld(src_uv, 0); + vec_uv = __lasx_xvsub_b(vec_uv, const_0x80); + vec_uv = __lasx_vext2xv_h_b(vec_uv); + YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_uv += 16; + } +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // ARGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
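+      // The rest of the loop computes Y = (kAddY + B*c0 + G*c1 + R*c2) >> 8:
+      // pickev/pickod split the BGRA bytes into B/R and G/A pairs, the widening
+      // multiply-accumulates sum into 16-bit lanes seeded with kAddY ($xr3),
+      // and the final pickod keeps the high byte of each lane (the >> 8).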
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} + +void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // RGBA + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G + "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} + +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[128] = { + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr4, %4, 0 \n\t" // load shuff + "xvld $xr5, %4, 32 \n\t" + "xvld $xr6, %4, 64 \n\t" + "xvld $xr7, %4, 96 \n\t" + "1: \n\t" + "xvld $xr8, %0, 0 \n\t" + "xvld $xr9, %0, 32 \n\t" + "xvld $xr10, %0, 64 \n\t" // load 32 pixels of + // RGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "xvor.v $xr11, $xr9, $xr9 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
+ "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0 + "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1 + "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2 + "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t" + "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t" + "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t" + "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t" + "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t" + "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t" + "addi.d %0, %0, 96 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvst $xr10, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); +} + +void ARGBToUVJRow_LASX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_argb = src_argb + src_stride_argb; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i nex0, nex1, nex2, nex3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, dst0; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_63 = __lasx_xvldi(0x43F); + __m256i const_42 = __lasx_xvldi(0x42A); + __m256i const_21 = __lasx_xvldi(0x415); + __m256i const_53 = __lasx_xvldi(0x435); + __m256i const_10 = __lasx_xvldi(0x40A); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301, + 0x1F1D0F0D1B190B09}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64, + next_argb, 96, nex0, nex1, nex2, nex3); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp2 = __lasx_xvpickev_b(src3, src2); + tmp3 = __lasx_xvpickod_b(src3, src2); + tmpr = __lasx_xvpickod_b(tmp2, tmp0); + tmpb = __lasx_xvpickev_b(tmp2, tmp0); + tmpg = __lasx_xvpickev_b(tmp3, tmp1); + tmp0 = __lasx_xvpickev_b(nex1, nex0); + tmp1 = __lasx_xvpickod_b(nex1, nex0); + tmp2 = __lasx_xvpickev_b(nex3, nex2); + tmp3 = __lasx_xvpickod_b(nex3, nex2); + nexr = __lasx_xvpickod_b(tmp2, tmp0); + nexb = __lasx_xvpickev_b(tmp2, tmp0); + nexg = __lasx_xvpickev_b(tmp3, tmp1); + tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb); + tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb); + tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg); + tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg); + reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr); + reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr); + tmpb = __lasx_xvavgr_hu(tmp0, tmp1); + tmpg = __lasx_xvavgr_hu(tmp2, tmp3); + tmpr = __lasx_xvavgr_hu(reg0, reg1); + reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb); + reg1 = __lasx_xvmadd_h(const_8080, 
const_63, tmpr); + reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg); + reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg); + reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr); + reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb); + dst0 = __lasx_xvpackod_b(reg1, reg0); + tmp0 = __lasx_xvpermi_d(dst0, 0x44); + tmp1 = __lasx_xvpermi_d(dst0, 0xEE); + dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 2); + __lasx_xvstelm_d(dst0, dst_u, 8, 1); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + dst_u += 16; + dst_v += 16; + src_argb += 128; + next_argb += 128; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) diff --git a/source/row_lsx.cc b/source/row_lsx.cc new file mode 100644 index 00000000..e626072a --- /dev/null +++ b/source/row_lsx.cc @@ -0,0 +1,2987 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Copyright (c) 2022 Loongson Technology Corporation Limited + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#include "libyuv/loongson_intrinsics.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \ + { \ + ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \ + vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \ + ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \ + vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \ + yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \ + yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ + } + +// Load 32 YUV422 pixel data +#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ + { \ + __m128i temp0, temp1; \ + \ + DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ + temp1 = __lsx_vld(psrc_v, 0); \ + temp0 = __lsx_vsub_b(temp0, const_80); \ + temp1 = __lsx_vsub_b(temp1, const_80); \ + temp0 = __lsx_vsllwil_h_b(temp0, 0); \ + temp1 = __lsx_vsllwil_h_b(temp1, 0); \ + uv_l = __lsx_vilvl_h(temp0, temp1); \ + uv_h = __lsx_vilvh_h(temp0, temp1); \ + } + +// Load 16 YUV422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ + { \ + __m128i temp0, temp1; \ + \ + out_y = __lsx_vld(psrc_y, 0); \ + temp0 = __lsx_vldrepl_d(psrc_u, 0); \ + temp1 = __lsx_vldrepl_d(psrc_v, 0); \ + uv = __lsx_vilvl_b(temp0, temp1); \ + uv = __lsx_vsub_b(uv, const_80); \ + uv = __lsx_vsllwil_h_b(uv, 0); \ + } + +// Convert 16 pixels of YUV420 to RGB. 
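+// In scalar form (illustrative; U/V already biased by -128) the macros
+// below compute, per pixel:
+//   y32 = ((y * 0x0101 * yg) >> 16) + yb;           // luma, Q6
+//   b = clamp255((y32 + u * ub) >> 6);
+//   r = clamp255((y32 + v * vr) >> 6);
+//   g = clamp255((y32 - (u * ug + v * vg)) >> 6);
+// with even/odd lanes evaluated separately and repacked at the end.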
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \ + g_h, r_l, r_h) \ + { \ + __m128i u_l, u_h, v_l, v_h; \ + __m128i yl_ev, yl_od, yh_ev, yh_od; \ + __m128i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lsx_vilvl_b(in_y, in_y); \ + temp1 = __lsx_vilvh_b(in_y, in_y); \ + yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \ + yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \ + yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \ + yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \ + DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ + yl_ev, yl_od, yh_ev, yh_od); \ + yl_ev = __lsx_vadd_w(yl_ev, yb); \ + yl_od = __lsx_vadd_w(yl_od, yb); \ + yh_ev = __lsx_vadd_w(yh_ev, yb); \ + yh_od = __lsx_vadd_w(yh_od, yb); \ + v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \ + u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \ + v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \ + u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \ + temp0 = __lsx_vadd_w(yl_ev, u_l); \ + temp1 = __lsx_vadd_w(yl_od, u_l); \ + temp2 = __lsx_vadd_w(yh_ev, u_h); \ + temp3 = __lsx_vadd_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + b_l = __lsx_vpackev_h(temp1, temp0); \ + b_h = __lsx_vpackev_h(temp3, temp2); \ + temp0 = __lsx_vadd_w(yl_ev, v_l); \ + temp1 = __lsx_vadd_w(yl_od, v_l); \ + temp2 = __lsx_vadd_w(yh_ev, v_h); \ + temp3 = __lsx_vadd_w(yh_od, v_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + r_l = __lsx_vpackev_h(temp1, temp0); \ + r_h = __lsx_vpackev_h(temp3, temp2); \ + DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ + temp0 = __lsx_vsub_w(yl_ev, u_l); \ + temp1 = __lsx_vsub_w(yl_od, u_l); \ + temp2 = __lsx_vsub_w(yh_ev, u_h); \ + temp3 = __lsx_vsub_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + g_l = __lsx_vpackev_h(temp1, temp0); \ + g_h = __lsx_vpackev_h(temp3, temp2); \ + } + +// Convert 8 pixels of YUV420 to RGB. 
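+// Single-vector variant of YUVTORGB_D: the 8 packed U/V byte pairs are
+// zero-extended to 16 bits and re-biased by -128 here, whereas the
+// 16-pixel version subtracts the bias while the bytes are loaded.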
+#define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ + { \ + __m128i y_ev, y_od, u_l, v_l; \ + __m128i tmp0, tmp1, tmp2, tmp3; \ + \ + tmp0 = __lsx_vilvl_b(in_y, in_y); \ + y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \ + y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \ + y_ev = __lsx_vsrai_w(y_ev, 16); \ + y_od = __lsx_vsrai_w(y_od, 16); \ + y_ev = __lsx_vadd_w(y_ev, yb); \ + y_od = __lsx_vadd_w(y_od, yb); \ + in_vu = __lsx_vilvl_b(zero, in_vu); \ + in_vu = __lsx_vsub_h(in_vu, const_80); \ + u_l = __lsx_vmulwev_w_h(in_vu, vrub); \ + v_l = __lsx_vmulwod_w_h(in_vu, vrub); \ + tmp0 = __lsx_vadd_w(y_ev, u_l); \ + tmp1 = __lsx_vadd_w(y_od, u_l); \ + tmp2 = __lsx_vadd_w(y_ev, v_l); \ + tmp3 = __lsx_vadd_w(y_od, v_l); \ + tmp0 = __lsx_vsrai_w(tmp0, 6); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp2 = __lsx_vsrai_w(tmp2, 6); \ + tmp3 = __lsx_vsrai_w(tmp3, 6); \ + tmp0 = __lsx_vclip255_w(tmp0); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + tmp2 = __lsx_vclip255_w(tmp2); \ + tmp3 = __lsx_vclip255_w(tmp3); \ + out_b = __lsx_vpackev_h(tmp1, tmp0); \ + out_r = __lsx_vpackev_h(tmp3, tmp2); \ + tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \ + tmp1 = __lsx_vsub_w(y_ev, tmp0); \ + tmp2 = __lsx_vsub_w(y_od, tmp0); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp2 = __lsx_vsrai_w(tmp2, 6); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + tmp2 = __lsx_vclip255_w(tmp2); \ + out_g = __lsx_vpackev_h(tmp2, tmp1); \ + } + +// Convert I444 pixels of YUV420 to RGB. +#define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \ + out_r) \ + { \ + __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \ + __m128i tmp0, tmp1, tmp2, tmp3; \ + \ + y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \ + y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \ + y_ev = __lsx_vsrai_w(y_ev, 16); \ + y_od = __lsx_vsrai_w(y_od, 16); \ + y_ev = __lsx_vadd_w(y_ev, yb); \ + y_od = __lsx_vadd_w(y_od, yb); \ + in_u = __lsx_vsub_h(in_u, const_80); \ + in_v = __lsx_vsub_h(in_v, const_80); \ + u_ev = __lsx_vmulwev_w_h(in_u, ub); \ + u_od = __lsx_vmulwod_w_h(in_u, ub); \ + v_ev = __lsx_vmulwev_w_h(in_v, vr); \ + v_od = __lsx_vmulwod_w_h(in_v, vr); \ + tmp0 = __lsx_vadd_w(y_ev, u_ev); \ + tmp1 = __lsx_vadd_w(y_od, u_od); \ + tmp2 = __lsx_vadd_w(y_ev, v_ev); \ + tmp3 = __lsx_vadd_w(y_od, v_od); \ + tmp0 = __lsx_vsrai_w(tmp0, 6); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp2 = __lsx_vsrai_w(tmp2, 6); \ + tmp3 = __lsx_vsrai_w(tmp3, 6); \ + tmp0 = __lsx_vclip255_w(tmp0); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + tmp2 = __lsx_vclip255_w(tmp2); \ + tmp3 = __lsx_vclip255_w(tmp3); \ + out_b = __lsx_vpackev_h(tmp1, tmp0); \ + out_r = __lsx_vpackev_h(tmp3, tmp2); \ + u_ev = __lsx_vpackev_h(in_u, in_v); \ + u_od = __lsx_vpackod_h(in_u, in_v); \ + v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \ + v_od = __lsx_vdp2_w_h(u_od, ugvg); \ + tmp0 = __lsx_vsub_w(y_ev, v_ev); \ + tmp1 = __lsx_vsub_w(y_od, v_od); \ + tmp0 = __lsx_vsrai_w(tmp0, 6); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp0 = __lsx_vclip255_w(tmp0); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + out_g = __lsx_vpackev_h(tmp1, tmp0); \ + } + +// Pack and Store 16 ARGB values. 
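+// ARGB is byte order B,G,R,A in memory: pack B/G and R/A bytes, then
+// interleave the halfwords so each pixel stores as dst[0]=B, dst[1]=G,
+// dst[2]=R, dst[3]=A.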
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ + { \ + __m128i temp0, temp1, temp2, temp3; \ + temp0 = __lsx_vpackev_b(g_l, b_l); \ + temp1 = __lsx_vpackev_b(a_l, r_l); \ + temp2 = __lsx_vpackev_b(g_h, b_h); \ + temp3 = __lsx_vpackev_b(a_h, r_h); \ + r_l = __lsx_vilvl_h(temp1, temp0); \ + r_h = __lsx_vilvh_h(temp1, temp0); \ + g_l = __lsx_vilvl_h(temp3, temp2); \ + g_h = __lsx_vilvh_h(temp3, temp2); \ + __lsx_vst(r_l, pdst_argb, 0); \ + __lsx_vst(r_h, pdst_argb, 16); \ + __lsx_vst(g_l, pdst_argb, 32); \ + __lsx_vst(g_h, pdst_argb, 48); \ + pdst_argb += 64; \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ + { \ + __m128i temp0, temp1; \ + __m128i dst0, dst1; \ + \ + temp0 = __lsx_vpackev_b(in_g, in_b); \ + temp1 = __lsx_vpackev_b(in_a, in_r); \ + dst0 = __lsx_vilvl_h(temp1, temp0); \ + dst1 = __lsx_vilvh_h(temp1, temp0); \ + __lsx_vst(dst0, pdst_argb, 0); \ + __lsx_vst(dst1, pdst_argb, 16); \ + pdst_argb += 32; \ + } + +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ + __m128i _reg0, _reg1; \ + _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \ + _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ + _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ + } + +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 32; + __m128i src0, src1; + __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 8; + __m128i src, dst; + __m128i shuffler = {0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 8) << 1; + for (x = 0; x < len; x++) { + src = __lsx_vld(src_uv, 0); + dst = __lsx_vshuf_h(shuffler, src, src); + __lsx_vst(dst, dst_uv, 0); + src_uv -= 16; + dst_uv += 16; + } +} + +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 8; + __m128i src0, src1; + __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504}; + + src += (width * 4) - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_yuy2_0, vec_yuy2_1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 
0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0); + __lsx_vst(vec_yuy2_0, dst_yuy2, 0); + __lsx_vst(vec_yuy2_1, dst_yuy2, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_yuy2 += 32; + } +} + +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_uyvy0, vec_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0); + __lsx_vst(vec_uyvy0, dst_uyvy, 0); + __lsx_vst(vec_uyvy1, dst_uyvy, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_uyvy += 32; + } +} + +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + int res = width & 15; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i zero = __lsx_vldi(0); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; + + y = __lsx_vld(src_a, 0); + a_l = __lsx_vilvl_b(zero, y); + a_h = 
__lsx_vilvh_b(zero, y); + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + src_a += 16; + } + if (res) { + __m128i y, uv, r, g, b, a; + a = __lsx_vld(src_a, 0); + a = __lsx_vsllwil_hu_bu(a, 0); + READYUV422(src_y, src_u, src_v, y, uv); + YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); + STOREARGB(a, r, g, b, dst_argb); + } +} + +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614}; + __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m128i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + temp0 = __lsx_vpackev_b(g_l, b_l); + temp1 = __lsx_vpackev_b(g_h, b_h); + DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l, + temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, + temp1); + + b_l = __lsx_vilvl_d(temp1, temp2); + b_h = __lsx_vilvh_d(temp3, temp1); + __lsx_vst(temp0, dst_argb, 0); + __lsx_vst(b_l, dst_argb, 16); + __lsx_vst(b_h, dst_argb, 32); + dst_argb += 48; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 2); + g_h = __lsx_vsrli_h(g_h, 2); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 11); + r_h = __lsx_vslli_h(r_h, 11); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vor_v(r_l, g_l); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, g_h); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_rgb565, 0); + __lsx_vst(r_h, dst_rgb565, 16); + dst_rgb565 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
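+// Scalar form of the packed-pixel stores in this group (illustrative):
+//   RGB565:   p = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
+//   ARGB4444: p = 0xF000 | ((r >> 4) << 8) | (g & 0xF0) | (b >> 4);
+//   ARGB1555: p = 0x8000 | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3);
+// where r, g, b are the clipped 8-bit channel values from YUVTORGB_D.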
+void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0xF000F000F000F000, 0xF000F000F000F000}; + __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 4); + b_h = __lsx_vsrli_h(b_h, 4); + r_l = __lsx_vsrli_h(r_l, 4); + r_h = __lsx_vsrli_h(r_h, 4); + g_l = __lsx_vand_v(g_l, mask); + g_h = __lsx_vand_v(g_h, mask); + r_l = __lsx_vslli_h(r_l, 8); + r_h = __lsx_vslli_h(r_h, 8); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb4444, 0); + __lsx_vst(r_h, dst_argb4444, 16); + dst_argb4444 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0x8000800080008000, 0x8000800080008000}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 3); + + g_h = __lsx_vsrli_h(g_h, 3); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 10); + r_h = __lsx_vslli_h(r_h, 10); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb1555, 0); + __lsx_vst(r_h, dst_argb1555, 16); + dst_argb1555 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + dst0 = __lsx_vpickev_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_yuy2 += 32; + dst_y += 16; + } +} + +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, dst0, dst1; + + for (x = 0; x < len; 
x++) { + DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0, + src_yuy2_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickod_b(src1, src0); + src1 = __lsx_vpickod_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + src_yuy2_next += 32; + dst_u += 8; + dst_v += 8; + } +} + +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + tmp0 = __lsx_vpickod_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + dst0 = __lsx_vpickod_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_uyvy += 32; + dst_y += 16; + } +} + +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0, + src_uyvy_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickev_b(src1, src0); + src1 = __lsx_vpickev_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + src_uyvy_next += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToUVRow_LSX(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* src_argb1 = src_argb0 + src_stride_argb; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i vec0, vec1, vec2, vec3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; + __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038}; + __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025}; + __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013}; + __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f}; + __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009}; + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0, + 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1, 
+ 48, src4, src5, src6, src7); + vec0 = __lsx_vaddwev_h_bu(src0, src4); + vec1 = __lsx_vaddwev_h_bu(src1, src5); + vec2 = __lsx_vaddwev_h_bu(src2, src6); + vec3 = __lsx_vaddwev_h_bu(src3, src7); + tmp0 = __lsx_vpickev_h(vec1, vec0); + tmp1 = __lsx_vpickev_h(vec3, vec2); + tmp2 = __lsx_vpickod_h(vec1, vec0); + tmp3 = __lsx_vpickod_h(vec3, vec2); + vec0 = __lsx_vaddwod_h_bu(src0, src4); + vec1 = __lsx_vaddwod_h_bu(src1, src5); + vec2 = __lsx_vaddwod_h_bu(src2, src6); + vec3 = __lsx_vaddwod_h_bu(src3, src7); + tmp4 = __lsx_vpickev_h(vec1, vec0); + tmp5 = __lsx_vpickev_h(vec3, vec2); + vec0 = __lsx_vpickev_h(tmp1, tmp0); + vec1 = __lsx_vpickod_h(tmp1, tmp0); + src0 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp3, tmp2); + vec1 = __lsx_vpickod_h(tmp3, tmp2); + src1 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp5, tmp4); + vec1 = __lsx_vpickod_h(tmp5, tmp4); + src2 = __lsx_vavgr_h(vec0, vec1); + dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70); + dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A); + dst0 = __lsx_vmsub_h(dst0, src1, const_0x26); + dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70); + dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E); + dst1 = __lsx_vmsub_h(dst1, src0, const_0x12); + dst0 = __lsx_vsrai_h(dst0, 8); + dst1 = __lsx_vsrai_h(dst1, 8); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + src_argb0 += 64; + src_argb1 += 64; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, 
shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i shift = {0x0300030003000300, 0x0300030003000300}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vpackev_b(zero, tmp1); + tmp1 = __lsx_vsrli_h(tmp1, 2); + tmp0 = __lsx_vsll_b(tmp0, shift); + tmp1 = __lsx_vslli_h(tmp1, 5); + dst0 = __lsx_vor_v(tmp0, tmp1); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; + __m128i shift1 = {0x0703070307030703, 0x0703070307030703}; + __m128i shift2 = {0x0200020002000200, 0x0200020002000200}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vsrl_b(tmp1, shift1); + tmp0 = __lsx_vsll_b(tmp0, shift2); + tmp2 = __lsx_vpackev_b(zero, tmp1); + tmp3 = __lsx_vpackod_b(zero, tmp1); + tmp2 = __lsx_vslli_h(tmp2, 5); + tmp3 = __lsx_vslli_h(tmp3, 15); + dst0 = __lsx_vor_v(tmp0, tmp2); + dst0 = __lsx_vor_v(dst0, tmp3); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vandi_b(tmp1, 0xF0); + tmp0 = __lsx_vsrli_b(tmp0, 4); + dst0 = __lsx_vor_v(tmp1, tmp0); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, dst0, dst1; + __m128i const_112 = __lsx_vldi(112); + __m128i const_74 = __lsx_vldi(74); + __m128i const_38 = __lsx_vldi(38); + __m128i const_94 = __lsx_vldi(94); + __m128i const_18 = __lsx_vldi(18); + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickev_h(src1, src0); + tmp1 = __lsx_vpickod_h(src1, src0); + tmp2 = __lsx_vpickev_h(src3, src2); + tmp3 = __lsx_vpickod_h(src3, src2); + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112); + reg2 = __lsx_vmulwod_h_bu(tmp0, const_74); + reg3 = __lsx_vmulwod_h_bu(tmp2, const_74); + reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38); + reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst0 = 
__lsx_vpickev_b(reg1, reg0); + + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112); + reg2 = __lsx_vmulwev_h_bu(tmp0, const_18); + reg3 = __lsx_vmulwev_h_bu(tmp2, const_18); + reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94); + reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst1 = __lsx_vpickev_b(reg1, reg0); + + __lsx_vst(dst0, dst_u, 0); + __lsx_vst(dst1, dst_v, 0); + dst_u += 16; + dst_v += 16; + src_argb += 64; + } +} + +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp2 = __lsx_vilvl_b(zero, src1); + tmp3 = __lsx_vilvh_b(zero, src1); + dst0 = __lsx_vmuh_hu(tmp0, tmp2); + dst1 = __lsx_vmuh_hu(tmp1, tmp3); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vsadd_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vssub_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i b, g, r, a, dst0, dst1; + __m128i control = {0x0005000100040000, 0x0007000300060002}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(tmp0, tmp0); + r = __lsx_vpackod_b(tmp0, tmp0); + g = __lsx_vpackev_b(tmp1, tmp1); + a = __lsx_vpackod_b(tmp1, tmp1); + reg0 = __lsx_vmulwev_w_hu(b, a); + reg1 = __lsx_vmulwod_w_hu(b, a); + reg2 = __lsx_vmulwev_w_hu(r, a); + reg3 = __lsx_vmulwod_w_hu(r, a); + reg4 = __lsx_vmulwev_w_hu(g, a); + reg5 = __lsx_vmulwod_w_hu(g, a); + reg0 = __lsx_vssrani_h_w(reg1, reg0, 24); + reg2 = __lsx_vssrani_h_w(reg3, reg2, 24); + reg4 = __lsx_vssrani_h_w(reg5, reg4, 24); + reg0 = __lsx_vshuf_h(control, reg0, reg0); + reg2 = __lsx_vshuf_h(control, reg2, reg2); + reg4 = __lsx_vshuf_h(control, reg4, reg4); + tmp0 = __lsx_vpackev_b(reg4, reg0); + tmp1 = __lsx_vpackev_b(a, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + src_argb += 32; + } +} + +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* 
dst_rgb, + uint32_t dither4, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i b, g, r; + __m128i zero = __lsx_vldi(0); + __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0); + + vec_dither = __lsx_vilvl_b(zero, vec_dither); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(zero, tmp0); + r = __lsx_vpackod_b(zero, tmp0); + g = __lsx_vpackev_b(zero, tmp1); + b = __lsx_vadd_h(b, vec_dither); + g = __lsx_vadd_h(g, vec_dither); + r = __lsx_vadd_h(r, vec_dither); + DUP2_ARG1(__lsx_vclip255_h, b, g, b, g); + r = __lsx_vclip255_h(r); + b = __lsx_vsrai_h(b, 3); + g = __lsx_vsrai_h(g, 2); + r = __lsx_vsrai_h(r, 3); + g = __lsx_vslli_h(g, 5); + r = __lsx_vslli_h(r, 11); + dst0 = __lsx_vor_v(b, g); + dst0 = __lsx_vor_v(dst0, r); + __lsx_vst(dst0, dst_rgb, 0); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, dst0, dst1; + __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808}; + __m128i temp = __lsx_vldrepl_w(shuffler, 0); + + shuf = __lsx_vadd_b(shuf, temp); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + dst0 = __lsx_vshuf_b(src0, src0, shuf); + dst1 = __lsx_vshuf_b(src1, src1, shuf); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + int len = width / 4; + __m128i src0, dst0, tmp0, tmp1; + __m128i vec_value = __lsx_vreplgr2vr_w(value); + + vec_value = __lsx_vilvl_b(vec_value, vec_value); + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb, 0); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp0 = __lsx_vmuh_hu(tmp0, vec_value); + tmp1 = __lsx_vmuh_hu(tmp1, vec_value); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, dst0, dst1; + __m128i const_128 = __lsx_vldi(0x480); + __m128i const_150 = __lsx_vldi(0x96); + __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + reg0 = __lsx_vdp2_h_bu(tmp0, const_br); + reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); + reg2 = __lsx_vadd_h(reg0, reg1); + tmp0 = __lsx_vpackod_b(reg2, reg2); + tmp1 = __lsx_vpackod_b(tmp1, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, spb, spg, spr; + __m128i dst0, dst1; + __m128i spb_g = __lsx_vldi(68); + __m128i spg_g = __lsx_vldi(88); + __m128i spr_g = __lsx_vldi(98); + __m128i spb_br = {0x2311231123112311, 0x2311231123112311}; + __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16}; + __m128i spr_br = {0x3218321832183218, 
0x3218321832183218}; + __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); + spr = __lsx_vdp2_h_bu(tmp0, spr_br); + spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g); + spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g); + spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g); + spb = __lsx_vsrli_h(spb, 7); + spg = __lsx_vsrli_h(spg, 7); + spr = __lsx_vsrli_h(spr, 7); + spg = __lsx_vsat_hu(spg, 7); + spr = __lsx_vsat_hu(spr, 7); + reg0 = __lsx_vpackev_b(spg, spb); + reg1 = __lsx_vshuf_b(tmp1, spr, shuff); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb4444, 0); + src1 = __lsx_vld(src_argb4444, 16); + tmp0 = __lsx_vandi_b(src0, 0x0F); + tmp1 = __lsx_vandi_b(src0, 0xF0); + tmp2 = __lsx_vandi_b(src1, 0x0F); + tmp3 = __lsx_vandi_b(src1, 0xF0); + reg0 = __lsx_vslli_b(tmp0, 4); + reg2 = __lsx_vslli_b(tmp2, 4); + reg1 = __lsx_vsrli_b(tmp1, 4); + reg3 = __lsx_vsrli_b(tmp3, 4); + DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0, + tmp1, tmp2, tmp3); + dst0 = __lsx_vilvl_b(tmp1, tmp0); + dst2 = __lsx_vilvl_b(tmp3, tmp2); + dst1 = __lsx_vilvh_b(tmp1, tmp0); + dst3 = __lsx_vilvh_b(tmp3, tmp2); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_argb4444 += 32; + } +} + +void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa; + __m128i reg0, reg1, reg2; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb1555, 0); + src1 = __lsx_vld(src_argb1555, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpg = __lsx_vsrli_b(tmp0, 5); + reg0 = __lsx_vandi_b(tmp1, 0x03); + reg0 = __lsx_vslli_b(reg0, 3); + tmpg = __lsx_vor_v(tmpg, reg0); + reg1 = __lsx_vandi_b(tmp1, 0x7C); + tmpr = __lsx_vsrli_b(reg1, 2); + tmpa = __lsx_vsrli_b(tmp1, 7); + tmpa = __lsx_vneg_b(tmpa); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vslli_b(tmpg, 3); + reg2 = __lsx_vslli_b(tmpr, 3); + tmpb = __lsx_vsrli_b(tmpb, 2); + tmpg = __lsx_vsrli_b(tmpg, 2); + tmpr = __lsx_vsrli_b(tmpr, 2); + tmpb = __lsx_vor_v(reg0, tmpb); + tmpg = __lsx_vor_v(reg1, tmpg); + tmpr = __lsx_vor_v(reg2, tmpr); + DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); + dst2 = __lsx_vilvl_h(reg1, reg0); + dst3 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_argb1555 += 32; + } +} + +void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + int 
len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr; + __m128i reg0, reg1, dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb565, 0); + src1 = __lsx_vld(src_rgb565, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpr = __lsx_vandi_b(tmp1, 0xF8); + reg1 = __lsx_vandi_b(tmp1, 0x07); + reg0 = __lsx_vsrli_b(tmp0, 5); + reg1 = __lsx_vslli_b(reg1, 3); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vsrli_b(tmpb, 2); + tmpb = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpg, 2); + reg1 = __lsx_vsrli_b(tmpg, 4); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vsrli_b(tmpr, 5); + tmpr = __lsx_vor_v(tmpr, reg0); + DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst2 = __lsx_vilvl_h(reg1, reg0); + dst3 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_rgb565 += 32; + } +} + +void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i tmp0, tmp1, tmp2; + __m128i dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514}; + __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100}; + __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C}; + __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb24, 0); + src1 = __lsx_vld(src_rgb24, 16); + src2 = __lsx_vld(src_rgb24, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + tmp2 = __lsx_vshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_rgb24 += 48; + } +} + +void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i tmp0, tmp1, tmp2; + __m128i dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514}; + __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100}; + __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C}; + __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_raw, 0); + src1 = __lsx_vld(src_raw, 16); + src2 = __lsx_vld(src_raw, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + tmp2 = __lsx_vshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_raw += 48; + } +} + +void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr; + __m128i 
reg0, reg1, reg2, dst0; + __m128i const_66 = __lsx_vldi(66); + __m128i const_129 = __lsx_vldi(129); + __m128i const_25 = __lsx_vldi(25); + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb1555, 0); + src1 = __lsx_vld(src_argb1555, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpg = __lsx_vsrli_b(tmp0, 5); + reg0 = __lsx_vandi_b(tmp1, 0x03); + reg0 = __lsx_vslli_b(reg0, 3); + tmpg = __lsx_vor_v(tmpg, reg0); + reg1 = __lsx_vandi_b(tmp1, 0x7C); + tmpr = __lsx_vsrli_b(reg1, 2); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vslli_b(tmpg, 3); + reg2 = __lsx_vslli_b(tmpr, 3); + tmpb = __lsx_vsrli_b(tmpb, 2); + tmpg = __lsx_vsrli_b(tmpg, 2); + tmpr = __lsx_vsrli_b(tmpr, 2); + tmpb = __lsx_vor_v(reg0, tmpb); + tmpg = __lsx_vor_v(reg1, tmpg); + tmpr = __lsx_vor_v(reg2, tmpr); + reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lsx_vpackod_b(reg1, reg0); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_argb1555 += 32; + } +} + +void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i reg0, reg1, reg2, reg3, dst0; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0, + next_argb1555, 16, src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + nexb = __lsx_vandi_b(tmp2, 0x1F); + tmpg = __lsx_vsrli_b(tmp0, 5); + nexg = __lsx_vsrli_b(tmp2, 5); + reg0 = __lsx_vandi_b(tmp1, 0x03); + reg2 = __lsx_vandi_b(tmp3, 0x03); + reg0 = __lsx_vslli_b(reg0, 3); + reg2 = __lsx_vslli_b(reg2, 3); + tmpg = __lsx_vor_v(tmpg, reg0); + nexg = __lsx_vor_v(nexg, reg2); + reg1 = __lsx_vandi_b(tmp1, 0x7C); + reg3 = __lsx_vandi_b(tmp3, 0x7C); + tmpr = __lsx_vsrli_b(reg1, 2); + nexr = __lsx_vsrli_b(reg3, 2); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vslli_b(tmpg, 3); + reg2 = __lsx_vslli_b(tmpr, 3); + tmpb = __lsx_vsrli_b(tmpb, 2); + tmpg = __lsx_vsrli_b(tmpg, 2); + tmpr = __lsx_vsrli_b(tmpr, 2); + tmpb = __lsx_vor_v(reg0, tmpb); + tmpg = __lsx_vor_v(reg1, tmpg); + tmpr = __lsx_vor_v(reg2, tmpr); + reg0 = __lsx_vslli_b(nexb, 3); + reg1 = __lsx_vslli_b(nexg, 3); + reg2 = __lsx_vslli_b(nexr, 3); + nexb = __lsx_vsrli_b(nexb, 2); + nexg = __lsx_vsrli_b(nexg, 2); + nexr = __lsx_vsrli_b(nexr, 2); + nexb = __lsx_vor_v(reg0, nexb); + nexg = __lsx_vor_v(reg1, nexg); + nexr = __lsx_vor_v(reg2, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + 
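+    // Note: the 5-bit fields above were widened to 8 bits with the
+    // (c << 3) | (c >> 2) pattern, which replicates the top bits so that
+    // 0x1F expands to 0xFF and 0 stays 0.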
src_argb1555 += 32; + next_argb1555 += 32; + } +} + +void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr; + __m128i reg0, reg1, dst0; + __m128i const_66 = __lsx_vldi(66); + __m128i const_129 = __lsx_vldi(129); + __m128i const_25 = __lsx_vldi(25); + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb565, 0); + src1 = __lsx_vld(src_rgb565, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpr = __lsx_vandi_b(tmp1, 0xF8); + reg1 = __lsx_vandi_b(tmp1, 0x07); + reg0 = __lsx_vsrli_b(tmp0, 5); + reg1 = __lsx_vslli_b(reg1, 3); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vsrli_b(tmpb, 2); + tmpb = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpg, 2); + reg1 = __lsx_vsrli_b(tmpg, 4); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vsrli_b(tmpr, 5); + tmpr = __lsx_vor_v(tmpr, reg0); + reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lsx_vpackod_b(reg1, reg0); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_rgb565 += 32; + } +} + +void RGB565ToUVRow_LSX(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i reg0, reg1, reg2, reg3, dst0; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0, + next_rgb565, 16, src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpr = __lsx_vandi_b(tmp1, 0xF8); + nexb = __lsx_vandi_b(tmp2, 0x1F); + nexr = __lsx_vandi_b(tmp3, 0xF8); + reg1 = __lsx_vandi_b(tmp1, 0x07); + reg3 = __lsx_vandi_b(tmp3, 0x07); + reg0 = __lsx_vsrli_b(tmp0, 5); + reg1 = __lsx_vslli_b(reg1, 3); + reg2 = __lsx_vsrli_b(tmp2, 5); + reg3 = __lsx_vslli_b(reg3, 3); + tmpg = __lsx_vor_v(reg1, reg0); + nexg = __lsx_vor_v(reg2, reg3); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vsrli_b(tmpb, 2); + reg2 = __lsx_vslli_b(nexb, 3); + reg3 = __lsx_vsrli_b(nexb, 2); + tmpb = __lsx_vor_v(reg1, reg0); + nexb = __lsx_vor_v(reg2, reg3); + reg0 = __lsx_vslli_b(tmpg, 2); + reg1 = __lsx_vsrli_b(tmpg, 4); + reg2 = __lsx_vslli_b(nexg, 2); + reg3 = __lsx_vsrli_b(nexg, 4); + tmpg = __lsx_vor_v(reg1, reg0); + nexg = __lsx_vor_v(reg2, reg3); + reg0 = __lsx_vsrli_b(tmpr, 5); + reg2 = __lsx_vsrli_b(nexr, 5); + tmpr = __lsx_vor_v(tmpr, reg0); + nexr = __lsx_vor_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + 
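+    // As in the ARGB1555 path, blue widens as (b << 3) | (b >> 2) and
+    // 6-bit green as (g << 2) | (g >> 4); red is already shifted by the
+    // 0xF8 mask, so r8 | (r8 >> 5) completes its widening.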
src_rgb565 += 32; + next_rgb565 += 32; + } +} + +void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24; + int len = width / 16; + __m128i src0, src1, src2; + __m128i nex0, nex1, nex2, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18}; + __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908}; + __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; + __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908}; + __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A}; + __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb24, 0); + src1 = __lsx_vld(src_rgb24, 16); + src2 = __lsx_vld(src_rgb24, 32); + nex0 = __lsx_vld(next_rgb24, 0); + nex1 = __lsx_vld(next_rgb24, 16); + nex2 = __lsx_vld(next_rgb24, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_rgb24 += 48; + next_rgb24 += 48; + } +} + +void RAWToUVRow_LSX(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_raw = src_raw + src_stride_raw; + int len = width / 16; + __m128i src0, src1, src2; + __m128i nex0, nex1, nex2, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18}; + __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908}; + __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; + __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908}; + __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A}; + __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_raw, 0); + src1 = __lsx_vld(src_raw, 16); + src2 = __lsx_vld(src_raw, 32); + nex0 = __lsx_vld(next_raw, 0); + nex1 = __lsx_vld(next_raw, 16); + nex2 = __lsx_vld(next_raw, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, 
shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_raw += 48; + next_raw += 48; + } +} + +void NV12ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i alpha = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_vu = __lsx_vld(src_uv, 0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 8; + src_uv += 8; + } +} + +void NV12ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_vu = __lsx_vld(src_uv, 0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + out_b = __lsx_vsrli_h(out_b, 3); + out_g = __lsx_vsrli_h(out_g, 2); + out_r = __lsx_vsrli_h(out_r, 3); + out_g = __lsx_vslli_h(out_g, 5); + out_r = __lsx_vslli_h(out_r, 11); + out_r = __lsx_vor_v(out_r, out_g); + out_r = __lsx_vor_v(out_r, out_b); + __lsx_vst(out_r, dst_rgb565, 0); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i vec_y, vec_uv; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_ubvr, vec_ugvg; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i alpha = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_uv = __lsx_vld(src_vu, 0); + YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g, + out_b); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 8; + src_vu += 8; + } +} + +void SobelRow_LSX(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0; + __m128i out0, out1, out2, out3; + __m128i alpha = __lsx_vldi(0xFF); + __m128i shuff0 = {0x1001010110000000, 0x1003030310020202}; + __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04); + __m128i shuff2 = 
__lsx_vaddi_bu(shuff1, 0x04); + __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_sobelx, 0); + src1 = __lsx_vld(src_sobely, 0); + tmp0 = __lsx_vsadd_bu(src0, src1); + DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha, + tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3); + __lsx_vst(out0, dst_argb, 0); + __lsx_vst(out1, dst_argb, 16); + __lsx_vst(out2, dst_argb, 32); + __lsx_vst(out3, dst_argb, 48); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_LSX(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + int len = width / 32; + __m128i src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1); + DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3); + dst0 = __lsx_vsadd_bu(src0, src2); + dst1 = __lsx_vsadd_bu(src1, src3); + __lsx_vst(dst0, dst_y, 0); + __lsx_vst(dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_LSX(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src_r, src_b, src_g; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + + for (x = 0; x < len; x++) { + src_r = __lsx_vld(src_sobelx, 0); + src_b = __lsx_vld(src_sobely, 0); + src_g = __lsx_vsadd_bu(src_r, src_b); + tmp0 = __lsx_vilvl_b(src_g, src_b); + tmp1 = __lsx_vilvh_b(src_g, src_b); + tmp2 = __lsx_vilvl_b(alpha, src_r); + tmp3 = __lsx_vilvh_b(alpha, src_r); + dst0 = __lsx_vilvl_h(tmp2, tmp0); + dst1 = __lsx_vilvh_h(tmp2, tmp0); + dst2 = __lsx_vilvl_h(tmp3, tmp1); + dst3 = __lsx_vilvh_h(tmp3, tmp1); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void BGRAToUVRow_LSX(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_bgra = src_bgra + src_stride_bgra; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickev_b(src1, src0); + tmp2 = __lsx_vpickod_b(src3, src2); + tmp3 = __lsx_vpickev_b(src3, src2); + tmpb = __lsx_vpickod_b(tmp2, tmp0); + tmpr = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickod_b(tmp3, tmp1); + tmp0 = __lsx_vpickod_b(nex1, nex0); + tmp1 = __lsx_vpickev_b(nex1, nex0); + tmp2 = __lsx_vpickod_b(nex3, nex2); + tmp3 = __lsx_vpickev_b(nex3, nex2); + nexb = __lsx_vpickod_b(tmp2, tmp0); + nexr = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickod_b(tmp3, tmp1); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, 
dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_bgra += 64; + next_bgra += 64; + } +} + +void ABGRToUVRow_LSX(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_abgr = src_abgr + src_stride_abgr; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp2 = __lsx_vpickev_b(src3, src2); + tmp3 = __lsx_vpickod_b(src3, src2); + tmpb = __lsx_vpickod_b(tmp2, tmp0); + tmpr = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickev_b(tmp3, tmp1); + tmp0 = __lsx_vpickev_b(nex1, nex0); + tmp1 = __lsx_vpickod_b(nex1, nex0); + tmp2 = __lsx_vpickev_b(nex3, nex2); + tmp3 = __lsx_vpickod_b(nex3, nex2); + nexb = __lsx_vpickod_b(tmp2, tmp0); + nexr = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickev_b(tmp3, tmp1); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_abgr += 64; + next_abgr += 64; + } +} + +void RGBAToUVRow_LSX(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_rgba = src_rgba + src_stride_rgba; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickev_b(src1, src0); + tmp2 = __lsx_vpickod_b(src3, src2); + tmp3 = __lsx_vpickev_b(src3, src2); + tmpr = __lsx_vpickod_b(tmp2, tmp0); + tmpb = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickod_b(tmp3, tmp1); + tmp0 = __lsx_vpickod_b(nex1, nex0); + tmp1 = __lsx_vpickev_b(nex1, nex0); + tmp2 = __lsx_vpickod_b(nex3, nex2); + tmp3 = __lsx_vpickev_b(nex3, nex2); + nexr = __lsx_vpickod_b(tmp2, tmp0); + nexb = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickod_b(tmp3, tmp1); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_rgba += 64; + next_rgba += 64; + } +} + +void ARGBToUVJRow_LSX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_argb = src_argb + src_stride_argb; + int len = width / 16; + __m128i src0, src1, src2, src3; 
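+  // Full-range (JPEG) chroma: the 2x2 averages feed a 7-bit fixed-point
+  // transform, roughly U = 128 + (63*B - 42*G - 21*R) / 128 and
+  // V = 128 + (63*R - 53*G - 10*B) / 128; the 0x8080 bias supplies the
+  // +128 offset plus rounding.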
+ __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_63 = __lsx_vldi(0x43F); + __m128i const_42 = __lsx_vldi(0x42A); + __m128i const_21 = __lsx_vldi(0x415); + __m128i const_53 = __lsx_vldi(0x435); + __m128i const_10 = __lsx_vldi(0x40A); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp2 = __lsx_vpickev_b(src3, src2); + tmp3 = __lsx_vpickod_b(src3, src2); + tmpr = __lsx_vpickod_b(tmp2, tmp0); + tmpb = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickev_b(tmp3, tmp1); + tmp0 = __lsx_vpickev_b(nex1, nex0); + tmp1 = __lsx_vpickod_b(nex1, nex0); + tmp2 = __lsx_vpickev_b(nex3, nex2); + tmp3 = __lsx_vpickod_b(nex3, nex2); + nexr = __lsx_vpickod_b(tmp2, tmp0); + nexb = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickev_b(tmp3, tmp1); + tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb); + tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb); + tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg); + tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg); + reg0 = __lsx_vaddwev_h_bu(tmpr, nexr); + reg1 = __lsx_vaddwod_h_bu(tmpr, nexr); + tmpb = __lsx_vavgr_hu(tmp0, tmp1); + tmpg = __lsx_vavgr_hu(tmp2, tmp3); + tmpr = __lsx_vavgr_hu(reg0, reg1); + reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb); + reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr); + reg0 = __lsx_vmsub_h(reg0, const_42, tmpg); + reg1 = __lsx_vmsub_h(reg1, const_53, tmpg); + reg0 = __lsx_vmsub_h(reg0, const_21, tmpr); + reg1 = __lsx_vmsub_h(reg1, const_10, tmpb); + dst0 = __lsx_vpickod_b(reg1, reg0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_argb += 64; + next_argb += 64; + } +} + +void I444ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r; + __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x480); + __m128i alpha = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_u = __lsx_vld(src_u, 0); + vec_v = __lsx_vld(src_v, 0); + vec_yl = __lsx_vilvl_b(vec_y, vec_y); + vec_ul = __lsx_vilvl_b(zero, vec_u); + vec_vl = __lsx_vilvl_b(zero, vec_v); + I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb, + out_b, out_g, out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + vec_yh = __lsx_vilvh_b(vec_y, vec_y); + vec_uh = __lsx_vilvh_b(zero, vec_u); + vec_vh = __lsx_vilvh_b(zero, vec_v); + I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb, + out_b, out_g, out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_u += 16; + src_v += 16; + } +} + +void I400ToARGBRow_LSX(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_y, vec_yl, vec_yh, out0; + 
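+  // Luma-only conversion: each Y is widened, multiplied by the kYToRgb
+  // gain (taking the top 16 bits), biased by kYBiasToRgb, shifted down
+  // by 6 and clipped, then replicated into B, G and R with alpha 0xFF.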
__m128i y_ev, y_od, dst0, dst1, dst2, dst3; + __m128i temp0, temp1; + __m128i alpha = __lsx_vldi(0xFF); + __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]); + __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_yl = __lsx_vilvl_b(vec_y, vec_y); + y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg); + y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg); + y_ev = __lsx_vsrai_w(y_ev, 16); + y_od = __lsx_vsrai_w(y_od, 16); + y_ev = __lsx_vadd_w(y_ev, vec_yb); + y_od = __lsx_vadd_w(y_od, vec_yb); + y_ev = __lsx_vsrai_w(y_ev, 6); + y_od = __lsx_vsrai_w(y_od, 6); + y_ev = __lsx_vclip255_w(y_ev); + y_od = __lsx_vclip255_w(y_od); + out0 = __lsx_vpackev_h(y_od, y_ev); + temp0 = __lsx_vpackev_b(out0, out0); + temp1 = __lsx_vpackev_b(alpha, out0); + dst0 = __lsx_vilvl_h(temp1, temp0); + dst1 = __lsx_vilvh_h(temp1, temp0); + vec_yh = __lsx_vilvh_b(vec_y, vec_y); + y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg); + y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg); + y_ev = __lsx_vsrai_w(y_ev, 16); + y_od = __lsx_vsrai_w(y_od, 16); + y_ev = __lsx_vadd_w(y_ev, vec_yb); + y_od = __lsx_vadd_w(y_od, vec_yb); + y_ev = __lsx_vsrai_w(y_ev, 6); + y_od = __lsx_vsrai_w(y_od, 6); + y_ev = __lsx_vclip255_w(y_ev); + y_od = __lsx_vclip255_w(y_od); + out0 = __lsx_vpackev_h(y_od, y_ev); + temp0 = __lsx_vpackev_b(out0, out0); + temp1 = __lsx_vpackev_b(alpha, out0); + dst2 = __lsx_vilvl_h(temp1, temp0); + dst3 = __lsx_vilvh_h(temp1, temp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_y += 16; + } +} + +void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m128i vec_y, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i alpha = __lsx_vldi(0xFF); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + tmp0 = __lsx_vilvl_b(vec_y, vec_y); + tmp1 = __lsx_vilvh_b(vec_y, vec_y); + tmp2 = __lsx_vilvl_b(alpha, vec_y); + tmp3 = __lsx_vilvh_b(alpha, vec_y); + dst0 = __lsx_vilvl_h(tmp2, tmp0); + dst1 = __lsx_vilvh_h(tmp2, tmp0); + dst2 = __lsx_vilvl_h(tmp3, tmp1); + dst3 = __lsx_vilvh_h(tmp3, tmp1); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_y += 16; + } +} + +void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i src0, vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i zero = __lsx_vldi(0); + __m128i alpha = __lsx_vldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_yuy2, 0); + vec_y = __lsx_vpickev_b(src0, src0); + vec_vu = __lsx_vpickod_b(src0, src0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_yuy2 += 16; + } +} + +void UYVYToARGBRow_LSX(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i src0, vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, 
vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i zero = __lsx_vldi(0); + __m128i alpha = __lsx_vldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_uyvy, 0); + vec_y = __lsx_vpickod_b(src0, src0); + vec_vu = __lsx_vpickev_b(src0, src0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_uyvy += 16; + } +} + +void InterpolateRow_LSX(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int x; + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* nex_ptr = src_ptr + src_stride; + uint16_t y_fractions; + int len = width / 32; + __m128i src0, src1, nex0, nex1; + __m128i dst0, dst1, y_frac; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i const_128 = __lsx_vldi(0x480); + + if (y1_fraction == 0) { + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + __lsx_vst(src0, dst_ptr, 0); + __lsx_vst(src1, dst_ptr, 16); + src_ptr += 32; + dst_ptr += 32; + } + return; + } + + if (y1_fraction == 128) { + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1); + dst0 = __lsx_vavgr_bu(src0, nex0); + dst1 = __lsx_vavgr_bu(src1, nex1); + __lsx_vst(dst0, dst_ptr, 0); + __lsx_vst(dst1, dst_ptr, 16); + src_ptr += 32; + nex_ptr += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = __lsx_vreplgr2vr_h(y_fractions); + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1); + tmp0 = __lsx_vilvl_b(nex0, src0); + tmp1 = __lsx_vilvh_b(nex0, src0); + tmp2 = __lsx_vilvl_b(nex1, src1); + tmp3 = __lsx_vilvh_b(nex1, src1); + tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac); + tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac); + tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac); + tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac); + dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8); + dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8); + __lsx_vst(dst0, dst_ptr, 0); + __lsx_vst(dst1, dst_ptr, 16); + src_ptr += 32; + nex_ptr += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + int len = width / 4; + __m128i dst0 = __lsx_vreplgr2vr_w(v32); + + for (x = 0; x < len; x++) { + __lsx_vst(dst0, dst_argb, 0); + dst_argb += 16; + } +} + +void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i dst0, dst1, dst2; + __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06}; + __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A}; + __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1); + src2 = __lsx_vld(src_raw, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1); + dst2 = __lsx_vshuf_b(src1, src2, shuf2); + dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E); + __lsx_vst(dst0, dst_rgb24, 0); + __lsx_vst(dst1, dst_rgb24, 16); + __lsx_vst(dst2, 
dst_rgb24, 32); + dst_rgb24 += 48; + src_raw += 48; + } +} + +void MergeUVRow_LSX(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1); + dst0 = __lsx_vilvl_b(src1, src0); + dst1 = __lsx_vilvh_b(src1, src0); + __lsx_vst(dst0, dst_uv, 0); + __lsx_vst(dst1, dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickod_b(src3, src2); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst_a, 0); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_LSX(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, dst0, dst1; + __m128i reg0, reg1, reg2, reg3; + __m128i a0, a1, a2, a3; + __m128i const_256 = __lsx_vldi(0x500); + __m128i zero = __lsx_vldi(0); + __m128i alpha = __lsx_vldi(0xFF); + __m128i control = {0xFF000000FF000000, 0xFF000000FF000000}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf4i_b(src0, 0xFF); + tmp1 = __lsx_vshuf4i_b(src1, 0xFF); + a0 = __lsx_vilvl_b(zero, tmp0); + a1 = __lsx_vilvh_b(zero, tmp0); + a2 = __lsx_vilvl_b(zero, tmp1); + a3 = __lsx_vilvh_b(zero, tmp1); + reg0 = __lsx_vilvl_b(zero, src2); + reg1 = __lsx_vilvh_b(zero, src2); + reg2 = __lsx_vilvl_b(zero, src3); + reg3 = __lsx_vilvh_b(zero, src3); + DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2, + const_256, a3, a0, a1, a2, a3); + DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1, + reg2, reg3); + DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1); + dst0 = __lsx_vsadd_bu(dst0, src0); + dst1 = __lsx_vsadd_bu(dst1, src1); + dst0 = __lsx_vbitsel_v(dst0, alpha, control); + dst1 = __lsx_vbitsel_v(dst1, alpha, control); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_LSX(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i vec_size = __lsx_vreplgr2vr_b(interval_size); + __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset); + __m128i vec_scale = __lsx_vreplgr2vr_w(scale); + __m128i zero = __lsx_vldi(0); + __m128i control = {0xFF000000FF000000, 0xFF000000FF000000}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48, + src0, src1, src2, src3); + reg0 = __lsx_vilvl_b(zero, src0); + reg1 = __lsx_vilvh_b(zero, src0); + reg2 = __lsx_vilvl_b(zero, src1); + reg3 = __lsx_vilvh_b(zero, src1); + reg4 = __lsx_vilvl_b(zero, src2); + reg5 = __lsx_vilvh_b(zero, src2); + reg6 = __lsx_vilvl_b(zero, src3); + reg7 = __lsx_vilvh_b(zero, src3); + tmp0 = __lsx_vilvl_h(zero, reg0); + 
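+    // Quantize per channel: v = (v * scale >> 16) * interval_size +
+    // interval_offset. The 16-bit lanes are widened to 32 bits for the
+    // multiply and narrowed back by vsrani_h_w; the vbitsel at the end
+    // restores the original alpha bytes via the 0xFF000000 mask.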
tmp1 = __lsx_vilvh_h(zero, reg0); + tmp2 = __lsx_vilvl_h(zero, reg1); + tmp3 = __lsx_vilvh_h(zero, reg1); + tmp4 = __lsx_vilvl_h(zero, reg2); + tmp5 = __lsx_vilvh_h(zero, reg2); + tmp6 = __lsx_vilvl_h(zero, reg3); + tmp7 = __lsx_vilvh_h(zero, reg3); + DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale, + tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale, + tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16, + tmp7, tmp6, 16, reg0, reg1, reg2, reg3); + dst0 = __lsx_vpickev_b(reg1, reg0); + dst1 = __lsx_vpickev_b(reg3, reg2); + tmp0 = __lsx_vilvl_h(zero, reg4); + tmp1 = __lsx_vilvh_h(zero, reg4); + tmp2 = __lsx_vilvl_h(zero, reg5); + tmp3 = __lsx_vilvh_h(zero, reg5); + tmp4 = __lsx_vilvl_h(zero, reg6); + tmp5 = __lsx_vilvh_h(zero, reg6); + tmp6 = __lsx_vilvl_h(zero, reg7); + tmp7 = __lsx_vilvh_h(zero, reg7); + DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale, + tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale, + tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16, + tmp7, tmp6, 16, reg0, reg1, reg2, reg3); + dst2 = __lsx_vpickev_b(reg1, reg0); + dst3 = __lsx_vpickev_b(reg3, reg2); + DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size, + dst3, vec_size, dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2, + vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3); + DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2, + src2, control, dst3, src3, control, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0, dst1; + __m128i tmp_b, tmp_g, tmp_r, tmp_a; + __m128i reg_b, reg_g, reg_r, reg_a; + __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0); + __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4); + __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8); + __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12); + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r, + src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a); + DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r, + src1, matrix_a, reg_b, reg_g, reg_r, reg_a); + DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a, + tmp_a, tmp_b, tmp_g, tmp_r, tmp_a); + DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a, + reg_a, reg_b, reg_g, reg_r, reg_a); + DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b, + tmp_g, tmp_r, tmp_a); + DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b, + reg_g, reg_r, reg_a); + DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r, + tmp_a) + DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r, + reg_a) + DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a, + tmp_a, tmp_b, tmp_g, tmp_r, tmp_a); + tmp0 = __lsx_vpackev_b(tmp_g, tmp_b); 
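+    // tmp0 holds interleaved B,G byte pairs; tmp1 below picks up the R,A
+    // pairs, and the vilvl/vilvh merge them back into full ARGB pixels.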
+ tmp1 = __lsx_vpackev_b(tmp_a, tmp_r); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_LSX(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3); + __lsx_vst(dst0, dst_u, 0); + __lsx_vst(dst1, dst_u, 16); + __lsx_vst(dst2, dst_v, 0); + __lsx_vst(dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) { + int x; + int len = width / 16; + __m128i dst0 = __lsx_vreplgr2vr_b(v8); + + for (x = 0; x < len; x++) { + __lsx_vst(dst0, dst, 0); + dst += 16; + } +} + +void MirrorSplitUVRow_LSX(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E}; + __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F}; + + src_uv += (width << 1); + for (x = 0; x < len; x++) { + src_uv -= 64; + DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2, + src3, src0, src1); + DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0, + shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_v, 0); + __lsx_vst(dst1, dst_v, 16); + __lsx_vst(dst2, dst_u, 0); + __lsx_vst(dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void HalfFloatRow_LSX(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int x; + int len = width / 32; + float mult = 1.9259299444e-34f * scale; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0); + __m128i zero = __lsx_vldi(0); + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4, + reg6); + DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5, + reg7); + DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult, + reg3, vec_mult, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult, + reg7, vec_mult, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13, + (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13, + (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, + dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + __lsx_vst(dst3, dst, 48); + src += 32; + dst += 32; + } +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + 
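+  // kAddY is the fixed-point rounding bias added before the final >> 8:
+  // 0x80 contributes +0.5 (full range), 0x1080 contributes +16.5 so that
+  // BT.601 output lands in the studio range.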
+  uint16_t pad;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+// e.g. white (B=G=R=255): (29 + 150 + 77) * 255 + 0x80 = 65408; 65408 >> 8 = 255.
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+                                                        128,
+                                                        0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+// e.g. white: (25 + 129 + 66) * 255 + 0x1080 = 60324; 60324 >> 8 = 235, the
+// studio-range maximum.
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+                                                        0x1080,
+                                                        0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+                                                      0x1080,
+                                                      0};
+
+// ARGB expects the first 3 bytes of each pixel to contain RGB; the 4th
+// (alpha) byte is ignored.
+static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct RgbConstants* rgbconstants) {
+  asm volatile(
+      "vldrepl.b      $vr0,  %3,    0     \n\t"  // load rgbconstants
+      "vldrepl.b      $vr1,  %3,    1     \n\t"  // load rgbconstants
+      "vldrepl.b      $vr2,  %3,    2     \n\t"  // load rgbconstants
+      "vldrepl.h      $vr3,  %3,    4     \n\t"  // load rgbconstants
+      "1:                                 \n\t"
+      "vld            $vr4,  %0,    0     \n\t"
+      "vld            $vr5,  %0,    16    \n\t"
+      "vld            $vr6,  %0,    32    \n\t"
+      "vld            $vr7,  %0,    48    \n\t"  // load 16 pixels of ARGB
+      "vor.v          $vr12, $vr3,  $vr3  \n\t"
+      "vor.v          $vr13, $vr3,  $vr3  \n\t"
+      "addi.d         %2,    %2,    -16   \n\t"  // 16 processed per loop.
+      "vpickev.b      $vr8,  $vr5,  $vr4  \n\t"  // BR
+      "vpickev.b      $vr10, $vr7,  $vr6  \n\t"
+      "vpickod.b      $vr9,  $vr5,  $vr4  \n\t"  // GA
+      "vpickod.b      $vr11, $vr7,  $vr6  \n\t"
+      "vmaddwev.h.bu  $vr12, $vr8,  $vr0  \n\t"  // B
+      "vmaddwev.h.bu  $vr13, $vr10, $vr0  \n\t"
+      "vmaddwev.h.bu  $vr12, $vr9,  $vr1  \n\t"  // G
+      "vmaddwev.h.bu  $vr13, $vr11, $vr1  \n\t"
+      "vmaddwod.h.bu  $vr12, $vr8,  $vr2  \n\t"  // R
+      "vmaddwod.h.bu  $vr13, $vr10, $vr2  \n\t"
+      "addi.d         %0,    %0,    64    \n\t"
+      "vpickod.b      $vr10, $vr13, $vr12 \n\t"
+      "vst            $vr10, %1,    0     \n\t"
+      "addi.d         %1,    %1,    16    \n\t"
+      "bnez           %2,    1b           \n\t"
+      : "+&r"(src_argb),  // %0
+        "+&r"(dst_y),     // %1
+        "+&r"(width)      // %2
+      : "r"(rgbconstants)
+      : "memory");
+}
+
+void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+  ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+  ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first byte of each pixel to be alpha (ignored), followed
+// by the 3 RGB bytes. Same code as ARGB, except for how the channels are
+// deinterleaved.
+static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct RgbConstants* rgbconstants) {
+  asm volatile(
+      "vldrepl.b      $vr0,  %3,    0     \n\t"  // load rgbconstants
+      "vldrepl.b      $vr1,  %3,    1     \n\t"  // load rgbconstants
+      "vldrepl.b      $vr2,  %3,    2     \n\t"  // load rgbconstants
+      "vldrepl.h      $vr3,  %3,    4     \n\t"  // load rgbconstants
+      "1:                                 \n\t"
+      "vld            $vr4,  %0,    0     \n\t"
+      "vld            $vr5,  %0,    16    \n\t"
+      "vld            $vr6,  %0,    32    \n\t"
+      "vld            $vr7,  %0,    48    \n\t"  // load 16 pixels of RGBA
+      "vor.v          $vr12, $vr3,  $vr3  \n\t"
+      "vor.v          $vr13, $vr3,  $vr3  \n\t"
+      "addi.d         %2,    %2,    -16   \n\t"  // 16 processed per loop.
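+      // For RGBA input the even bytes of each pair are A,G and the odd
+      // bytes are B,R, so B and R below come from the vpickod results
+      // while G uses the odd lanes of the vpickev results.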
+ "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG + "vpickev.b $vr10, $vr7, $vr6 \n\t" + "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR + "vpickod.b $vr11, $vr7, $vr6 \n\t" + "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G + "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t" + "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t" + "addi.d %0, %0, 64 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants) + : "memory"); +} + +void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, + 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, + 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, + 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, + 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "vld $vr4, %4, 0 \n\t" // load shuff + "vld $vr5, %4, 16 \n\t" + "vld $vr6, %4, 32 \n\t" + "vld $vr7, %4, 48 \n\t" + "1: \n\t" + "vld $vr8, %0, 0 \n\t" + "vld $vr9, %0, 16 \n\t" + "vld $vr10, %0, 32 \n\t" // load 16 pixels of + // RGB + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. 
+ "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t" + "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t" + "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t" + "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t" + "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G + "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t" + "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t" + "addi.d %0, %0, 48 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/source/row_msa.cc b/source/row_msa.cc new file mode 100644 index 00000000..b7d5bb5e --- /dev/null +++ b/source/row_msa.cc @@ -0,0 +1,3597 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <string.h>
+
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+  {                                                      \
+    ub = __msa_fill_w(yuvconst->kUVToB[0]);              \
+    vr = __msa_fill_w(yuvconst->kUVToR[1]);              \
+    ug = __msa_fill_w(yuvconst->kUVToG[0]);              \
+    vg = __msa_fill_w(yuvconst->kUVToG[1]);              \
+    yg = __msa_fill_w(yuvconst->kYToRgb[0]);             \
+    yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]);         \
+  }
+
+// Load YUV 422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)    \
+  {                                                                \
+    uint64_t y_m;                                                  \
+    uint32_t u_m, v_m;                                             \
+    v4i32 zero_m = {0};                                            \
+    y_m = LD(psrc_y);                                              \
+    u_m = LW(psrc_u);                                              \
+    v_m = LW(psrc_v);                                              \
+    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \
+    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m);        \
+    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m);        \
+  }
+
+// Clip input vector elements between 0 and 255
+#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
+  {                                               \
+    v4i32 max_m = __msa_ldi_w(0xFF);              \
+                                                  \
+    in0 = __msa_maxi_s_w(in0, 0);                 \
+    in1 = __msa_maxi_s_w(in1, 0);                 \
+    in2 = __msa_maxi_s_w(in2, 0);                 \
+    in3 = __msa_maxi_s_w(in3, 0);                 \
+    in4 = __msa_maxi_s_w(in4, 0);                 \
+    in5 = __msa_maxi_s_w(in5, 0);                 \
+    in0 = __msa_min_s_w(max_m, in0);              \
+    in1 = __msa_min_s_w(max_m, in1);              \
+    in2 = __msa_min_s_w(max_m, in2);              \
+    in3 = __msa_min_s_w(max_m, in3);              \
+    in4 = __msa_min_s_w(max_m, in4);              \
+    in5 = __msa_min_s_w(max_m, in5);              \
+  }
+
+// Convert 8 pixels of YUV 420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+  {                                                                    \
+    v8i16 vec0_m, vec1_m;                                              \
+    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                      \
+    v4i32 reg5_m, reg6_m, reg7_m;                                      \
+    v16i8 temp_m, zero_m = {0};                                        \
+                                                                       \
+    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);            \
+    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv);         \
+    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m);        \
+    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m);        \
+    vec1_m = (v8i16)__msa_subv_h(vec1_m, const_0x80);                  \
+    temp_m = (v16i8)__msa_clti_s_h(vec1_m, 0);                         \
+    reg2_m = (v4i32)__msa_ilvr_h((v8i16)temp_m, (v8i16)vec1_m);        \
+    reg3_m = (v4i32)__msa_ilvl_h((v8i16)temp_m, (v8i16)vec1_m);        \
+    reg0_m *= yg;                                                      \
+    reg1_m *= yg;                                                      \
+    reg2_m *= ubvr;                                                    \
+    reg3_m *= ubvr;                                                    \
+    reg0_m = __msa_srai_w(reg0_m, 16);                                 \
+    reg1_m = __msa_srai_w(reg1_m, 16);                                 \
+    reg0_m += yb;                                                      \
+    reg1_m += yb;                                                      \
+    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);               \
+    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                            \
+    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                            \
+    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                             \
+    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                            \
+    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                            \
+    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                             \
+    reg5_m = reg0_m + reg5_m;                                          \
+    reg6_m = reg1_m + reg6_m;                                          \
+    reg2_m = reg0_m + reg2_m;                                          \
+    reg3_m = reg1_m + reg3_m;                                          \
+    reg7_m = reg0_m - reg7_m;                                          \
+    reg4_m = reg1_m - reg4_m;                                          \
+    reg5_m = __msa_srai_w(reg5_m, 6);                                  \
+    reg6_m = __msa_srai_w(reg6_m, 6);                                  \
+    reg7_m = __msa_srai_w(reg7_m, 6);                                  \
+    reg4_m = __msa_srai_w(reg4_m, 6);                                  \
+    reg2_m = __msa_srai_w(reg2_m, 6);                                  \
+    reg3_m = __msa_srai_w(reg3_m, 6);                                  \
+    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m);       \
+    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);               \
+    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);               \
+    out_r = __msa_pckev_h((v8i16)reg3_m,
(v8i16)reg2_m); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ + { \ + v8i16 vec0_m, vec1_m; \ + v16u8 dst0_m, dst1_m; \ + vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ + dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ + ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ + } + +// Takes ARGB input and calculates Y. +#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ + y_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8u16 reg0_m, reg1_m; \ + \ + vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ + vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ + vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ + vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const0); \ + reg1_m = __msa_dotp_u_h(vec1_m, const0); \ + reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ + reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ + reg0_m += const2; \ + reg1_m += const2; \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ + y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Loads current and next row of ARGB input and averages it to calculate U and V +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \ + { \ + v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ + v8u16 reg8_m, reg9_m; \ + \ + src0_m = (v16u8)__msa_ld_b((void*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((void*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((void*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((void*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((void*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((void*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((void*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((void*)t, 48); \ + vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ + reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ + reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m += const_0x0101; \ + reg9_m += const_0x0101; \ + 
reg0_m += const_0x0101; \ + reg1_m += const_0x0101; \ + argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \ + argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \ + argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \ + argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \ + } + +#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, shift, u_out, v_out) \ + { \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const0); \ + reg1_m = __msa_dotp_u_w(vec1_m, const0); \ + reg2_m = __msa_dotp_u_w(vec4_m, const0); \ + reg3_m = __msa_dotp_u_w(vec5_m, const0); \ + reg0_m += const1; \ + reg1_m += const1; \ + reg2_m += const1; \ + reg3_m += const1; \ + reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \ + reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \ + reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \ + reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \ + reg0_m = __msa_srl_w(reg0_m, shift); \ + reg1_m = __msa_srl_w(reg1_m, shift); \ + reg2_m = __msa_srl_w(reg2_m, shift); \ + reg3_m = __msa_srl_w(reg3_m, shift); \ + u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Takes ARGB input and calculates U and V. 
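+// (as does ARGBTOUV above; both take their coefficients as parameters).
+// For reference, a scalar sketch of the fixed-point chroma math these
+// macros vectorize, using the 112/74/38 and 112/94/18 coefficients and
+// the 0x8080 rounding bias that appear elsewhere in this file
+// (illustration only, not part of the build):
+//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;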
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const1); \ + reg1_m = __msa_dotp_u_w(vec1_m, const1); \ + reg2_m = __msa_dotp_u_w(vec4_m, const1); \ + reg3_m = __msa_dotp_u_w(vec5_m, const1); \ + reg0_m += (v4u32)const3; \ + reg1_m += (v4u32)const3; \ + reg2_m += (v4u32)const3; \ + reg3_m += (v4u32)const3; \ + reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ + u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ + v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ + } + +// Load I444 pixel data +#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m, u_m, v_m; \ + v2i64 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LD(psrc_u); \ + v_m = LD(psrc_v); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ + } + +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ + { \ + v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \ + v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \ + _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \ + _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \ + _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \ + _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \ + _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \ + _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \ + _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \ + _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \ + _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \ + _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \ + _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \ + _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \ + _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \ + _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \ + _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \ + _reg1 = const_8080 + const_112 * _reg0; \ + _reg3 = const_8080 + const_112 * _reg4; \ + _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \ + _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \ + _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \ + _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \ + _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \ + } + +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + src += width - 64; + + for (x = 0; x < width; x += 64) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + 
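+    // shuffler reverses the bytes within each 16-byte vector; combined
+    // with loading the four vectors from the end of the row (src -= 64),
+    // this is the scalar equivalent of dst[x] = src[width - 1 - x],
+    // 64 bytes per iteration.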
VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + v8u16 src, dst; + v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0}; + src_uv += (width - 8) << 1; + for (x = 0; x < width; x += 8) { + src = LD_UH(src_uv); + dst = __msa_vshf_h(shuffler, src, src); + ST_UH(dst, dst_uv); + src_uv -= 16; + dst_uv += 16; + } +} + +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + src += width * 4 - 64; + + for (x = 0; x < width; x += 16) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + 
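+  // Same conversion as I422ToARGBRow_MSA above; only the STOREARGB
+  // argument order differs (alpha passed first), giving RGBA byte order.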
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + STOREARGB(alpha, vec0, vec1, vec2, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64_t data_a; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v4i32 zero = {0}; + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); + STOREARGB(vec0, vec1, vec2, src3, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + src_a += 8; + dst_argb += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int64_t data_u, data_v; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = {0}; + v8i16 const_0x80 = __msa_ldi_h(0x80); + v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; + v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; + v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, + 11, 29, 12, 13, 30, 14, 15, 31}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8)__msa_insert_d(zero, 0, data_u); + src2 = (v16u8)__msa_insert_d(zero, 0, data_v); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5); + reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); + reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); + reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); + dst0 = (v16u8)__msa_vshf_b(shuffler0, 
(v16i8)reg3, (v16i8)reg0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); + ST_UB2(dst0, dst1, dst_argb, 16); + ST_UB(dst2, (dst_argb + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + dst_argb += 48; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + vec0 = __msa_srli_h(vec0, 3); + vec1 = __msa_srli_h(vec1, 2); + vec2 = __msa_srli_h(vec2, 3); + vec2 = __msa_slli_h(vec2, 11); + vec1 = __msa_slli_h(vec1, 5); + vec0 |= vec1; + dst0 = (v16u8)(vec2 | vec0); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_u += 4; + src_v += 4; + dst_rgb565 += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + v8u16 mask = (v8u16)__msa_fill_h(0x00F0); + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + reg0 = (v8u16)__msa_srli_h(vec0, 4); + reg2 = (v8u16)__msa_srli_h(vec2, 4); + reg1 = (v8u16)__msa_and_v(vec1, mask); + reg2 = (v8u16)__msa_slli_h(reg2, 8); + reg1 |= const_0xF000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb4444); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb4444 += 16; + } +} + +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = 
(v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + reg0 = (v8u16)__msa_srli_h(vec0, 3); + reg1 = (v8u16)__msa_srli_h(vec1, 3); + reg2 = (v8u16)__msa_srli_h(vec2, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); + reg1 |= const_0x8000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb1555); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb1555 += 16; + } +} + +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, 
src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16i8 zero = {0}; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); + reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); + reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); + reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb += 64; + dst_y += 16; + } +} + +void ARGBToUVRow_MSA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* src_argb_next = src_argb + src_stride_argb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v16u8 dst0, dst1; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); + src6 = 
(v16u8)__msa_ld_b((v16u8*)src_argb, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 = __msa_hadd_u_h(vec8, vec8); + reg1 = __msa_hadd_u_h(vec9, vec9); + reg2 = __msa_hadd_u_h(vec4, vec4); + reg3 = __msa_hadd_u_h(vec5, vec5); + reg4 = __msa_hadd_u_h(vec0, vec0); + reg5 = __msa_hadd_u_h(vec1, vec1); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 += __msa_hadd_u_h(vec8, vec8); + reg1 += __msa_hadd_u_h(vec9, vec9); + reg2 += __msa_hadd_u_h(vec4, vec4); + reg3 += __msa_hadd_u_h(vec5, vec5); + reg4 += __msa_hadd_u_h(vec0, vec0); + reg5 += __msa_hadd_u_h(vec1, vec1); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg4 += const_0x0001; + reg5 += const_0x0001; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); + reg6 = reg0 * const_0x70; + reg7 = reg1 * const_0x70; + reg8 = reg2 * const_0x4A; + reg9 = reg3 * const_0x4A; + reg6 += const_0x8080; + reg7 += const_0x8080; + reg8 += reg4 * const_0x26; + reg9 += reg5 * const_0x26; + reg0 *= const_0x12; + reg1 *= const_0x12; + reg2 *= const_0x5E; + reg3 *= const_0x5E; + reg4 *= const_0x70; + reg5 *= const_0x70; + reg2 += reg0; + reg3 += reg1; + reg4 += const_0x8080; + reg5 += const_0x8080; + reg6 -= reg8; + reg7 -= reg9; + reg4 -= reg2; + reg5 -= reg3; + reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); + reg7 = 
(v8u16)__msa_srai_h((v8i16)reg7, 8); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb += 128; + src_argb_next += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; + v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, + 16, 17, 18, 20, 21, 22, 24, 25}; + v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, + 21, 22, 24, 25, 26, 28, 29, 30}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; + v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, + 18, 17, 16, 22, 21, 20, 26, 25}; + v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, + 21, 20, 26, 25, 24, 30, 29, 28}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); + vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); + vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec1 = __msa_binsli_b(vec2, vec3, 4); + vec4 = __msa_binsli_b(vec4, vec5, 2); + vec5 = __msa_binsli_b(vec6, vec7, 4); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void 
ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); + vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); + vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); + vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); + vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); + vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); + vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); + vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec5 = __msa_binsli_b(vec5, vec6, 2); + vec1 = __msa_binsli_b(vec2, vec3, 5); + vec6 = __msa_binsli_b(vec7, vec8, 5); + vec1 = __msa_binsli_b(vec1, vec4, 0); + vec6 = __msa_binsli_b(vec6, vec9, 0); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1; + v16u8 vec0, vec1; + v16u8 dst0; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); + vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); + src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); + src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); + vec0 = __msa_binsli_b(vec0, src0, 3); + vec1 = __msa_binsli_b(vec1, src1, 3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int32_t x; + v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11; + v8u16 const_112 = (v8u16)__msa_ldi_h(112); + v8u16 const_74 = (v8u16)__msa_ldi_h(74); + v8u16 const_38 = (v8u16)__msa_ldi_h(38); + v8u16 const_94 = (v8u16)__msa_ldi_h(94); + v8u16 const_18 = (v8u16)__msa_ldi_h(18); + v8u16 const_32896 = (v8u16)__msa_fill_h(32896); + v16i8 zero = {0}; + + for (x = width; x > 0; x -= 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + 
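+    // 4:4:4 sampling: a single row is read and no 2x2 averaging is done,
+    // so U and V are produced for every pixel using the full 112/74/38
+    // and 112/94/18 coefficients declared above.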
vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec10 = vec0 * const_18; + vec11 = vec1 * const_18; + vec8 = vec2 * const_94; + vec9 = vec3 * const_94; + vec6 = vec4 * const_112; + vec7 = vec5 * const_112; + vec0 *= const_112; + vec1 *= const_112; + vec2 *= const_74; + vec3 *= const_74; + vec4 *= const_38; + vec5 *= const_38; + vec8 += vec10; + vec9 += vec11; + vec6 += const_32896; + vec7 += const_32896; + vec0 += const_32896; + vec1 += const_32896; + vec2 += vec4; + vec3 += vec5; + vec0 -= vec2; + vec1 -= vec3; + vec6 -= vec8; + vec7 -= vec9; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); + vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBMultiplyRow_MSA(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + v8i16 zero = {0}; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_MSA(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_MSA(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); + 
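+    // __msa_subs_u_b is a per-byte saturating subtract: each channel
+    // clamps at 0 instead of wrapping, i.e. dst = max(a - b, 0).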
dst0 = __msa_subs_u_b(src0, src2); + dst1 = __msa_subs_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 zero = {0}; + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); + vec4 = (v8u16)__msa_fill_h(vec0[3]); + vec5 = (v8u16)__msa_fill_h(vec0[7]); + vec6 = (v8u16)__msa_fill_h(vec1[3]); + vec7 = (v8u16)__msa_fill_h(vec1[7]); + vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec6 = (v8u16)__msa_fill_h(vec2[3]); + vec7 = (v8u16)__msa_fill_h(vec2[7]); + vec8 = (v8u16)__msa_fill_h(vec3[3]); + vec9 = (v8u16)__msa_fill_h(vec3[7]); + vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); + reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); + reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); + reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); + reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); + reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); + reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); + reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, src0, mask); + dst1 = __msa_bmnz_v(dst1, src1, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + v16u8 src0, src1, dst0, vec0, vec1; + v8i16 vec_d0; + v8i16 reg0, reg1, reg2; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(0xFF); + + vec_d0 = (v8i16)__msa_fill_w(dither4); + vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = 
(v16u8)__msa_ld_b((void*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); + reg0 += vec_d0; + reg1 += vec_d0; + reg2 += vec_d0; + reg0 = __msa_maxi_s_h((v8i16)reg0, 0); + reg1 = __msa_maxi_s_h((v8i16)reg1, 0); + reg2 = __msa_maxi_s_h((v8i16)reg2, 0); + reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); + reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); + reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); + reg0 = __msa_srai_h(reg0, 3); + reg2 = __msa_srai_h(reg2, 3); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_slli_h(reg2, 11); + reg1 = __msa_slli_h(reg1, 5); + reg0 |= reg1; + dst0 = (v16u8)(reg0 | reg2); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v16i8 vec0; + v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + int32_t val = LW((int32_t*)shuffler); + + vec0 = (v16i8)__msa_fill_w(val); + shuffler_vec += vec0; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + v16u8 src0, dst0; + v8u16 vec0, vec1; + v4u32 reg0, reg1, reg2, reg3, rgba_scale; + v8i16 zero = {0}; + + rgba_scale[0] = value; + rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); + rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= rgba_scale; + reg1 *= rgba_scale; + reg2 *= rgba_scale; + reg3 *= rgba_scale; + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0, dst1; + v8u16 reg0; + v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D); + v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + reg0 = __msa_dotp_u_h(vec0, const_0x961D); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8); + vec0 = 
(v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2; + v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); + v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); + v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); + v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); + v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); + v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); + v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); + reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); + reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); + reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); + reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); + reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); + reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); + reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); + reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); + vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); + vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); + vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); + ST_UB2(dst0, dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1; + v8u16 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16); + vec0 = (v8u16)__msa_andi_b(src0, 0x0F); + vec1 = (v8u16)__msa_andi_b(src1, 0x0F); + vec2 = (v8u16)__msa_andi_b(src0, 0xF0); + vec3 = (v8u16)__msa_andi_b(src1, 0xF0); + vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); + vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); + vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); + vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb4444 += 32; + dst_argb += 64; + } +} + +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; + v16u8 dst0, dst1, dst2, dst3; + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0); + src1 = 
(v8u16)__msa_ld_h((void*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); + reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); + reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); + reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); + reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); + reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); + reg3 = -reg3; + reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); + reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); + reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); + reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb1555 += 32; + dst_argb += 64; + } +} + +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); + res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); + res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb565 += 32; + dst_argb += 64; + } +} + +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2; + 
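+  // Expands 48 bytes of packed RGB24 into 64 bytes of ARGB: sldi_b
+  // realigns the packed triples, and shuffler indices 16..19 pull the
+  // 0xFF bytes from the alpha vector into every fourth output byte.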
v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb24 += 48; + dst_argb += 64; + } +} + +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_raw += 48; + dst_argb += 64; + } +} + +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; + v16u8 reg0, reg1, reg2, dst; + v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; + v8i16 res0, res1; + v8i16 const_66 = (v8i16)__msa_ldi_h(66); + v8i16 const_129 = (v8i16)__msa_ldi_h(129); + v8i16 const_25 = (v8i16)__msa_ldi_h(25); + v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080); + v16u8 zero = (v16u8)__msa_ldi_b(0); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpg = (v16u8)__msa_srli_b(tmp0, 5); + reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); + reg0 = (v16u8)__msa_slli_b(reg0, 3); + tmpg = (v16u8)__msa_or_v(tmpg, reg0); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); + tmpr = (v16u8)__msa_srli_b(reg1, 2); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_slli_b(tmpg, 3); + reg2 = (v16u8)__msa_slli_b(tmpr, 3); + tmpb = (v16u8)__msa_srli_b(tmpb, 2); + tmpg = (v16u8)__msa_srli_b(tmpg, 2); + tmpr = (v16u8)__msa_srli_b(tmpr, 2); + tmpb = (v16u8)__msa_or_v(reg0, tmpb); + tmpg = (v16u8)__msa_or_v(reg1, tmpg); + tmpr = (v16u8)__msa_or_v(reg2, tmpr); + tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); + tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); + tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); + tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); + tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); + tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); + res0 = const_1080 + const_25 * tmpb_r; + res1 = 
const_1080 + const_25 * tmpb_l; + res0 += const_129 * tmpg_r; + res1 += const_129 * tmpg_l; + res0 += const_66 * tmpr_r; + res1 += const_66 * tmpr_l; + dst = (v16u8)__msa_pckod_b(res1, res0); + ST_UB(dst, dst_y); + src_argb1555 += 32; + dst_y += 16; + } +} + +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; + v16u8 reg0, reg1, dst; + v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; + v8i16 res0, res1; + v8i16 const_66 = (v8i16)__msa_ldi_h(66); + v8i16 const_129 = (v8i16)__msa_ldi_h(129); + v8i16 const_25 = (v8i16)__msa_ldi_h(25); + v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080); + v16u8 zero = __msa_ldi_b(0); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0); + src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); + reg0 = (v16u8)__msa_srli_b(tmp0, 5); + reg1 = (v16u8)__msa_slli_b(reg1, 3); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_srli_b(tmpb, 2); + tmpb = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_slli_b(tmpg, 2); + reg1 = (v16u8)__msa_srli_b(tmpg, 4); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_srli_b(tmpr, 5); + tmpr = (v16u8)__msa_or_v(tmpr, reg0); + tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); + tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); + tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); + tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); + tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); + tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); + res0 = const_1080 + const_25 * tmpb_r; + res1 = const_1080 + const_25 * tmpb_l; + res0 += const_129 * tmpg_r; + res1 += const_129 * tmpg_l; + res0 += const_66 * tmpr_r; + res1 += const_66 * tmpr_l; + dst = (v16u8)__msa_pckod_b(res1, res0); + ST_UB(dst, dst_y); + src_rgb565 += 32; + dst_y += 16; + } +} + +void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); + v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); + 
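// The dotp/dpadd pairs above and below accumulate 129*G + 25*B + 66*R per + // pixel (const_0x42's zero high byte drops the stray neighboring-pixel byte + // that pckod packed in); with the 0x1080 bias and the >>8 that follow, this + // is the usual BT.601 luma Y = (66*R + 129*G + 25*B + 0x1080) >> 8. +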
vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb += 48; + dst_y += 16; + } +} + +void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); + v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb += 48; + dst_y += 16; + } +} + +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_argb1555; + const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); + int64_t res0, res1; + v16u8 src0, src1, src2, src3, dst; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 reg0, reg1, reg2, reg3; + v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; + v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); + v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); + v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); + v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); + v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); + v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((void*)s, 0); + src1 = (v8u16)__msa_ld_b((void*)s, 16); + src2 = (v8u16)__msa_ld_b((void*)t, 0); + src3 = (v8u16)__msa_ld_b((void*)t, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmp2 = (v16u8)__msa_pckev_b(src3, src2); + tmp3 = (v16u8)__msa_pckod_b(src3, src2); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); + tmpg = (v16u8)__msa_srli_b(tmp0, 5); + nexg = (v16u8)__msa_srli_b(tmp2, 5); + reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); + reg2 = (v16u8)__msa_andi_b(tmp3, 0x03); + reg0 = (v16u8)__msa_slli_b(reg0, 3); + reg2 = (v16u8)__msa_slli_b(reg2, 3); + tmpg = 
(v16u8)__msa_or_v(tmpg, reg0); + nexg = (v16u8)__msa_or_v(nexg, reg2); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); + reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C); + tmpr = (v16u8)__msa_srli_b(reg1, 2); + nexr = (v16u8)__msa_srli_b(reg3, 2); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_slli_b(tmpg, 3); + reg2 = (v16u8)__msa_slli_b(tmpr, 3); + tmpb = (v16u8)__msa_srli_b(tmpb, 2); + tmpg = (v16u8)__msa_srli_b(tmpg, 2); + tmpr = (v16u8)__msa_srli_b(tmpr, 2); + tmpb = (v16u8)__msa_or_v(reg0, tmpb); + tmpg = (v16u8)__msa_or_v(reg1, tmpg); + tmpr = (v16u8)__msa_or_v(reg2, tmpr); + reg0 = (v16u8)__msa_slli_b(nexb, 3); + reg1 = (v16u8)__msa_slli_b(nexg, 3); + reg2 = (v16u8)__msa_slli_b(nexr, 3); + nexb = (v16u8)__msa_srli_b(nexb, 2); + nexg = (v16u8)__msa_srli_b(nexg, 2); + nexr = (v16u8)__msa_srli_b(nexr, 2); + nexb = (v16u8)__msa_or_v(reg0, nexb); + nexg = (v16u8)__msa_or_v(reg1, nexg); + nexr = (v16u8)__msa_or_v(reg2, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); + res0 = __msa_copy_u_d((v2i64)dst, 0); + res1 = __msa_copy_u_d((v2i64)dst, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_rgb565; + const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); + int64_t res0, res1; + v16u8 src0, src1, src2, src3, dst; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 reg0, reg1, reg2, reg3; + v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; + v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); + v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); + v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); + v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); + v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); + v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)t, 0); + src3 = (v16u8)__msa_ld_b((void*)t, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmp2 = (v16u8)__msa_pckev_b(src3, src2); + tmp3 = (v16u8)__msa_pckod_b(src3, src2); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); + nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); + nexr = (v16u8)__msa_andi_b(tmp3, 0xF8); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); + reg3 = (v16u8)__msa_andi_b(tmp3, 0x07); + reg0 = (v16u8)__msa_srli_b(tmp0, 5); + reg1 = (v16u8)__msa_slli_b(reg1, 3); + reg2 = (v16u8)__msa_srli_b(tmp2, 5); + reg3 = (v16u8)__msa_slli_b(reg3, 3); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + nexg = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_srli_b(tmpb, 2); + reg2 = (v16u8)__msa_slli_b(nexb, 3); + reg3 = (v16u8)__msa_srli_b(nexb, 2); + tmpb = (v16u8)__msa_or_v(reg1, reg0); + nexb = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_slli_b(tmpg, 2); + reg1 = (v16u8)__msa_srli_b(tmpg, 4); + reg2 = (v16u8)__msa_slli_b(nexg, 2); + reg3 = (v16u8)__msa_srli_b(nexg, 4); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + nexg = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_srli_b(tmpr, 5); + reg2 = (v16u8)__msa_srli_b(nexr, 5); + tmpr = (v16u8)__msa_or_v(tmpr, reg0); + nexr = (v16u8)__msa_or_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); + res0 = __msa_copy_u_d((v2i64)dst, 0); + res1 = __msa_copy_u_d((v2i64)dst, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 
16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB24ToUVRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + int64_t res0, res1; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((void*)s, 0); + inp1 = (v16u8)__msa_ld_b((void*)s, 16); + inp2 = (v16u8)__msa_ld_b((void*)s, 32); + inp3 = (v16u8)__msa_ld_b((void*)t, 0); + inp4 = (v16u8)__msa_ld_b((void*)t, 16); + inp5 = (v16u8)__msa_ld_b((void*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h((v8i16)reg0, 1); + 
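// The four shifts complete a rounded 2x2 average: hadd summed the row pair, + // the pckev/pckod adds summed adjacent columns, and +1 with >>1 halves the + // total. Each lane still holds 2x the mean, which appears to be why the + // constants are half the usual 112/74/38/94/18 UV weights their names cite. +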
reg1 = __msa_srai_h((v8i16)reg1, 1); + reg2 = __msa_srai_h((v8i16)reg2, 1); + reg3 = __msa_srai_h((v8i16)reg3, 1); + vec4 = (v8u16)__msa_pckev_h(reg1, reg0); + vec5 = (v8u16)__msa_pckev_h(reg3, reg2); + vec6 = (v8u16)__msa_pckod_h(reg1, reg0); + vec7 = (v8u16)__msa_pckod_h(reg3, reg2); + vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void RAWToUVRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + int64_t res0, res1; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((void*)s, 0); + inp1 = (v16u8)__msa_ld_b((void*)s, 16); + inp2 = (v16u8)__msa_ld_b((void*)s, 32); + inp3 = (v16u8)__msa_ld_b((void*)t, 0); + inp4 = (v16u8)__msa_ld_b((void*)t, 16); + inp5 = (v16u8)__msa_ld_b((void*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 
= (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h(reg0, 1); + reg1 = __msa_srai_h(reg1, 1); + reg2 = __msa_srai_h(reg2, 1); + reg3 = __msa_srai_h(reg3, 1); + vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_uv += 8; + dst_argb += 32; + } +} + +void NV12ToRGB565Row_MSA(const uint8_t* src_y, 
+ const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v8i16 const_0x80 = __msa_ldi_h(0x80); + v16u8 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + vec0 = vec0 >> 3; + vec1 = (vec1 >> 2) << 5; + vec2 = (vec2 >> 3) << 11; + dst0 = (v16u8)(vec0 | vec1 | vec2); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16u8 zero = {0}; + v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_vu); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_vu += 8; + dst_argb += 32; + } +} + +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; + v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; + v16i8 const_0x4 = __msa_ldi_b(0x4); + v16i8 mask1 = mask0 + const_0x4; + v16i8 mask2 = mask1 + const_0x4; + v16i8 mask3 = mask2 + const_0x4; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); + dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + src0 = 
(v16u8)__msa_ld_b((void*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, vec1, vec2; + v16u8 reg0, reg1, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); + reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); + reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); + v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D); + v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb += 64; + dst_y += 16; + } +} + +void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); + v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb += 64; + dst_y += 16; + } +} + +void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); + v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb += 64; + dst_y += 16; + } +} + +void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); + v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); + 
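// Note: ARGBTOY's packed halfword constants encode per-byte coefficients in + // each format's memory order; for RGBA (bytes A,B,G,R) 0x1900 zeroes A and + // weights B by 25, while 0x4281 weights G by 129 and R by 66, giving + // Y = (66*R + 129*G + 25*B + 0x1080) >> 8 as in the variants above. +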
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb += 64; + dst_y += 16; + } +} + +void ARGBToUVJRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + v8u16 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3; + v8u16 dst0, dst1, dst2, dst3; + v16u8 zero = {0}; + v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); + v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); + v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); + v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); + v4i32 shift = __msa_fill_w(0x00000008); + + for (x = 0; x < width; x += 32) { + src1 = __msa_ld_b((void*)s, 0); + src3 = __msa_ld_b((void*)s, 16); + src5 = __msa_ld_b((void*)t, 0); + src7 = __msa_ld_b((void*)t, 16); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = __msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 32); + src3 = __msa_ld_b((void*)s, 48); + src5 = __msa_ld_b((void*)t, 32); + src7 = __msa_ld_b((void*)t, 48); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst0, dst1); + + src1 = __msa_ld_b((void*)s, 64); + src3 = __msa_ld_b((void*)s, 80); + src5 = __msa_ld_b((void*)t, 64); + src7 = __msa_ld_b((void*)t, 80); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = 
__msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 96); + src3 = __msa_ld_b((void*)s, 112); + src5 = __msa_ld_b((void*)t, 96); + src7 = __msa_ld_b((void*)t, 112); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst2, dst3); + + dst0 = (v8u16)__msa_pckev_b(dst2, dst0); + dst1 = (v8u16)__msa_pckev_b(dst3, dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void BGRAToUVRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; + v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); + + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; + } +} + +void ABGRToUVRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); + + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; + } +} + +void 
RGBAToUVRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; + v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; + v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); + + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; + } +} + +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 zero = {0}; + v4i32 const_0x80 = __msa_fill_w(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + + for (x = 0; x < width; x += 8) { + READI444(src_y, src_u, src_v, src0, src1, src2); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg0 += vec_yb; + reg1 += vec_yb; + vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); + reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + reg6 -= const_0x80; + reg7 -= const_0x80; + reg8 -= const_0x80; + reg9 -= const_0x80; + tmp0 = reg0 + reg6 * vec_ub; + tmp1 = reg1 + reg7 * vec_ub; + tmp2 = reg0 + reg8 * vec_vr; + tmp3 = reg1 + reg9 * vec_vr; + tmp4 = reg6 * vec_ug; + tmp5 = reg7 * vec_ug; + tmp4 += reg8 * vec_vg; + tmp5 += reg9 * vec_vg; + tmp4 = reg0 - tmp4; + tmp5 = reg1 - tmp5; + reg0 = __msa_srai_w(tmp0, 6); + reg1 = __msa_srai_w(tmp1, 6); + reg2 = __msa_srai_w(tmp2, 6); + reg3 = __msa_srai_w(tmp3, 6); + reg4 = __msa_srai_w(tmp4, 6); + reg5 = __msa_srai_w(tmp5, 6); + CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); + dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); + 
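// vec0 holds interleaved B,G byte pairs and vec1 holds R,A pairs, so the + // two halfword interleaves above emit 8 pixels in B,G,R,A memory order + // (libyuv ARGB layout). +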
ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_u += 8; + src_v += 8; + dst_argb += 32; + } +} + +// TODO - respect YuvConstants +void I400ToARGBRow_MSA(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; +#if defined(__aarch64__) || defined(__arm__) + int ygb = yuvconstants->kUVBiasBGR[3]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ygb = yuvconstants->kYBiasToRgb[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1; + v4i32 reg0, reg1, reg2, reg3; + v4i32 vec_yg = __msa_fill_w(yg); + v8i16 vec_ygb = __msa_fill_h(ygb); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 max = __msa_ldi_h(0xFF); + v8i16 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_y, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h(zero, vec0); + reg1 = (v4i32)__msa_ilvl_h(zero, vec0); + reg2 = (v4i32)__msa_ilvr_h(zero, vec1); + reg3 = (v4i32)__msa_ilvl_h(zero, vec1); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg2 *= vec_yg; + reg3 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg2 = __msa_srai_w(reg2, 16); + reg3 = __msa_srai_w(reg3, 16); + vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 += vec_ygb; + vec1 += vec_ygb; + vec0 = __msa_srai_h(vec0, 6); + vec1 = __msa_srai_h(vec1, 6); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); + res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); + res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); + res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_y, 0); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, 
vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); + src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_yuy2 += 16; + dst_argb += 32; + } +} + +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; + v4i32 vec_ubvr, vec_ugvg; + v8i16 const_0x80 = __msa_ldi_h(0x80); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); + src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_uyvy += 16; + dst_argb += 32; + } +} + +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int32_t y1_fraction = source_y_fraction; + int32_t y0_fraction = 256 - y1_fraction; + uint16_t y_fractions; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, y_frac; + + if (0 == y1_fraction) { + memcpy(dst_ptr, src_ptr, width); + return; + } + + if (128 == y1_fraction) { + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)t, 0); + src3 = (v16u8)__msa_ld_b((void*)t, 16); + dst0 = __msa_aver_u_b(src0, src2); + dst1 = __msa_aver_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = (v8u16)__msa_fill_h(y_fractions); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)t, 0); + src3 = (v16u8)__msa_ld_b((void*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); + vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); + vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); + vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + v4i32 dst0 = 
__builtin_msa_fill_w(v32); + + for (x = 0; x < width; x += 4) { + ST_UB(dst0, dst_argb); + dst_argb += 16; + } +} + +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; + v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, + 18, 17, 16, 21, 20, 19, 24, 23}; + v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, + 24, 23, 28, 27, 26, 31, 30, 29}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); + src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); + ST_UB2(dst0, dst1, dst_rgb24, 16); + ST_UB(dst2, (dst_rgb24 + 32)); + src_raw += 48; + dst_rgb24 += 48; + } +} + +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_u, 0); + src1 = (v16u8)__msa_ld_b((void*)src_v, 0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); + ST_UB2(dst0, dst1, dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int i; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + + for (i = 0; i < width; i += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_a); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_MSA(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11, vec12, vec13; + v8u16 const_256 = (v8u16)__msa_ldi_h(256); + v16u8 const_255 = (v16u8)__msa_ldi_b(255); + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); + vec8 = (v8u16)__msa_fill_h(vec0[3]); + vec9 = (v8u16)__msa_fill_h(vec0[7]); + vec10 = (v8u16)__msa_fill_h(vec1[3]); + vec11 = (v8u16)__msa_fill_h(vec1[7]); + vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + vec9 = 
(v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec10 = (v8u16)__msa_fill_h(vec2[3]); + vec11 = (v8u16)__msa_fill_h(vec2[7]); + vec12 = (v8u16)__msa_fill_h(vec3[3]); + vec13 = (v8u16)__msa_fill_h(vec3[7]); + vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); + vec8 = const_256 - vec8; + vec9 = const_256 - vec9; + vec10 = const_256 - vec10; + vec11 = const_256 - vec11; + vec8 *= vec4; + vec9 *= vec5; + vec10 *= vec6; + vec11 *= vec7; + vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); + vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); + vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); + vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst0 = (v16u8)__msa_adds_u_b(dst0, dst2); + dst1 = (v16u8)__msa_adds_u_b(dst1, dst3); + dst0 = __msa_bmnz_v(dst0, const_255, mask); + dst1 = __msa_bmnz_v(dst1, const_255, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v4i32 vec_scale = __msa_fill_w(scale); + v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); + v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); + v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48); + vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); + tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); + tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); + tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); + tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); + tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); + tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); + tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); + tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); + tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); + tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); + tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); + tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); + tmp0 *= vec_scale; + tmp1 *= vec_scale; + tmp2 *= vec_scale; + tmp3 *= vec_scale; + tmp4 *= vec_scale; + tmp5 *= vec_scale; + tmp6 *= vec_scale; + tmp7 *= vec_scale; + tmp8 *= vec_scale; + tmp9 *= vec_scale; + 
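// All 64 widened components are multiplied by scale; the >>16 below + // effectively treats scale as a 16.16 fixed-point factor, *interval_size + // plus interval_offset then requantize, and the final vshf mask (indices + // 19/23/27/31) restores each pixel's original alpha byte from src0..src3. +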
tmp10 *= vec_scale; + tmp11 *= vec_scale; + tmp12 *= vec_scale; + tmp13 *= vec_scale; + tmp14 *= vec_scale; + tmp15 *= vec_scale; + tmp0 >>= 16; + tmp1 >>= 16; + tmp2 >>= 16; + tmp3 >>= 16; + tmp4 >>= 16; + tmp5 >>= 16; + tmp6 >>= 16; + tmp7 >>= 16; + tmp8 >>= 16; + tmp9 >>= 16; + tmp10 >>= 16; + tmp11 >>= 16; + tmp12 >>= 16; + tmp13 >>= 16; + tmp14 >>= 16; + tmp15 >>= 16; + vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + dst0 *= vec_int_sz; + dst1 *= vec_int_sz; + dst2 *= vec_int_sz; + dst3 *= vec_int_sz; + dst0 += vec_int_ofst; + dst1 += vec_int_ofst; + dst2 += vec_int_ofst; + dst3 += vec_int_ofst; + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int32_t x; + v16i8 src0; + v16u8 src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + src0 = __msa_ld_b((void*)matrix_argb, 0); + vec0 = (v8i16)__msa_ilvr_b(zero, src0); + vec1 = (v8i16)__msa_ilvl_b(zero, src0); + + for (x = 0; x < width; x += 8) { + src1 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 16); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); + vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); + vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); + vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); + vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); + vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); + vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); + vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); + vec10 = vec2 * vec0; + vec11 = vec2 * vec1; + vec12 = vec6 * vec0; + vec13 = vec6 * vec1; + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + vec14 = vec3 * vec0; + vec15 = vec3 * vec1; + vec16 = vec7 * vec0; + vec17 = vec7 * vec1; + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, 
(v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + tmp0 = __msa_srai_w(tmp0, 6); + tmp1 = __msa_srai_w(tmp1, 6); + tmp2 = __msa_srai_w(tmp2, 6); + tmp3 = __msa_srai_w(tmp3, 6); + vec2 = vec4 * vec0; + vec6 = vec4 * vec1; + vec3 = vec8 * vec0; + vec7 = vec8 * vec1; + tmp8 = __msa_hadd_s_w(vec2, vec2); + tmp9 = __msa_hadd_s_w(vec6, vec6); + tmp10 = __msa_hadd_s_w(vec3, vec3); + tmp11 = __msa_hadd_s_w(vec7, vec7); + vec4 = vec5 * vec0; + vec8 = vec5 * vec1; + vec5 = vec9 * vec0; + vec9 = vec9 * vec1; + tmp12 = __msa_hadd_s_w(vec4, vec4); + tmp13 = __msa_hadd_s_w(vec8, vec8); + tmp14 = __msa_hadd_s_w(vec5, vec5); + tmp15 = __msa_hadd_s_w(vec9, vec9); + vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + tmp4 = __msa_srai_w(tmp4, 6); + tmp5 = __msa_srai_w(tmp5, 6); + tmp6 = __msa_srai_w(tmp6, 6); + tmp7 = __msa_srai_w(tmp7, 6); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec10 = __msa_maxi_s_h(vec10, 0); + vec11 = __msa_maxi_s_h(vec11, 0); + vec12 = __msa_maxi_s_h(vec12, 0); + vec13 = __msa_maxi_s_h(vec13, 0); + vec10 = __msa_min_s_h(vec10, max); + vec11 = __msa_min_s_h(vec11, max); + vec12 = __msa_min_s_h(vec12, max); + vec13 = __msa_min_s_h(vec13, max); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((void*)src_uv, 0); + src1 = (v16u8)__msa_ld_b((void*)src_uv, 16); + src2 = (v16u8)__msa_ld_b((void*)src_uv, 32); + src3 = (v16u8)__msa_ld_b((void*)src_uv, 48); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_u, 16); + ST_UB2(dst2, dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_b(v8); + + for (x = 0; x < width; x += 16) { + ST_UB(dst0, dst); + dst += 16; + } +} + +void MirrorSplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; + v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; + + src_uv += (2 * width); + + for (x = 0; x < width; x += 32) { + src_uv -= 64; + src2 = (v16u8)__msa_ld_b((void*)src_uv, 0); + src3 = (v16u8)__msa_ld_b((void*)src_uv, 16); + src0 = (v16u8)__msa_ld_b((void*)src_uv, 32); + src1 = 
(v16u8)__msa_ld_b((void*)src_uv, 48); + dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_v, 16); + ST_UB2(dst2, dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int32_t width) { + int x; + v16u8 src0, src1, src2, src3, src4, src5, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; + v16i8 tmp = __msa_ldi_b(8); + v16i8 mask1 = mask0 + tmp; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_y0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_y1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_y1, 16); + src4 = (v16u8)__msa_ld_b((void*)src_y2, 0); + src5 = (v16u8)__msa_ld_b((void*)src_y2, 16); + vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); + vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobelx); + src_y0 += 16; + src_y1 += 16; + src_y2 += 16; + dst_sobelx += 16; + } +} + +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int32_t width) { + int x; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_y1, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + vec0 -= vec2; + vec1 -= vec3; + vec6[0] = src_y0[16] - src_y1[16]; + vec6[1] = src_y0[17] - src_y1[17]; + vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); + vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); + vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); + vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = 
__msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobely); + src_y0 += 16; + src_y1 += 16; + dst_sobely += 16; + } +} + +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; + v4f32 mult_vec; + v8i16 zero = {0}; + mult_vec[0] = 1.9259299444e-34f * scale; + mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); + + for (i = 0; i < width; i += 32) { + src0 = (v8u16)__msa_ld_h((void*)src, 0); + src1 = (v8u16)__msa_ld_h((void*)src, 16); + src2 = (v8u16)__msa_ld_h((void*)src, 32); + src3 = (v8u16)__msa_ld_h((void*)src, 48); + vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); + vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); + vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); + vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); + vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); + vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); + vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); + vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); + fvec0 = __msa_ffint_u_w(vec0); + fvec1 = __msa_ffint_u_w(vec1); + fvec2 = __msa_ffint_u_w(vec2); + fvec3 = __msa_ffint_u_w(vec3); + fvec4 = __msa_ffint_u_w(vec4); + fvec5 = __msa_ffint_u_w(vec5); + fvec6 = __msa_ffint_u_w(vec6); + fvec7 = __msa_ffint_u_w(vec7); + fvec0 *= mult_vec; + fvec1 *= mult_vec; + fvec2 *= mult_vec; + fvec3 *= mult_vec; + fvec4 *= mult_vec; + fvec5 *= mult_vec; + fvec6 *= mult_vec; + fvec7 *= mult_vec; + vec0 = ((v4u32)fvec0) >> 13; + vec1 = ((v4u32)fvec1) >> 13; + vec2 = ((v4u32)fvec2) >> 13; + vec3 = ((v4u32)fvec3) >> 13; + vec4 = ((v4u32)fvec4) >> 13; + vec5 = ((v4u32)fvec5) >> 13; + vec6 = ((v4u32)fvec6) >> 13; + vec7 = ((v4u32)fvec7) >> 13; + dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); + dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + ST_UH2(dst0, dst1, dst, 8); + ST_UH2(dst2, dst3, dst + 16, 8); + src += 32; + dst += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/source/row_neon.cc b/source/row_neon.cc new file mode 100644 index 00000000..4ed13638 --- /dev/null +++ b/source/row_neon.cc @@ -0,0 +1,3957 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are +// reserved. + +// q0: Y uint16x8_t +// d2: U uint8x8_t +// d3: V uint8x8_t + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.32 {d2[0]}, [%[src_u]]! \n" \ + "vld1.32 {d2[1]}, [%[src_v]]! 
\n" \ + "vmov.u8 d1, d0 \n" \ + "vmovl.u8 q1, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 q1, q1, #8 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_u]]! \n" \ + "vmovl.u8 q0, d0 \n" \ + "vld1.8 {d3}, [%[src_v]]! \n" \ + "vsli.u16 q0, q0, #8 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vmov.u8 q1, #128 \n" \ + "vmovl.u8 q0, d0 \n" \ + "vsli.u16 q0, q0, #8 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_uv]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ + "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_vu]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ + "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ + +// Read 8 YUY2 +#define READYUY2 \ + "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ + "vmovl.u8 q0, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" + +// Read 8 UYVY +#define READUYVY \ + "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ + "vmovl.u8 q0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" + +// TODO: Use single register for kUVCoeff and multiply by lane +#define YUVTORGB_SETUP \ + "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \ + "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ + "vdup.u16 q10, d31[1] \n" \ + "vdup.u16 q11, d31[2] \n" \ + "vdup.u16 q12, d31[3] \n" \ + "vdup.u16 d31, d31[0] \n" + +// q0: B uint16x8_t +// q1: G uint16x8_t +// q2: R uint16x8_t + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "vmull.u16 q2, d1, d31 \n" \ + "vmull.u8 q8, d3, d29 \n" /* DGV */ \ + "vmull.u16 q0, d0, d31 \n" \ + "vmlal.u8 q8, d2, d28 \n" /* DG */ \ + "vqshrn.u32 d0, q0, #16 \n" \ + "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ + "vmull.u8 q9, d2, d26 \n" /* DB */ \ + "vmull.u8 q2, d3, d27 \n" /* DR */ \ + "vadd.u16 q4, q0, q11 \n" /* G */ \ + "vadd.u16 q2, q0, q2 \n" /* R */ \ + "vadd.u16 q0, q0, q9 \n" /* B */ \ + "vqsub.u16 q1, q4, q8 \n" /* G */ \ + "vqsub.u16 q0, q0, q10 \n" /* B */ \ + "vqsub.u16 q2, q2, q12 \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "vqshrn.u16 d4, q2, #6 \n" /* R */ \ + "vqshrn.u16 d2, q1, #6 \n" /* G */ \ + "vqshrn.u16 d0, q0, #6 \n" /* B */ + +#define YUVTORGB_REGS \ + "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" + +#define STORERGBA \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d4 \n" \ + "vmov.u8 d0, d6 \n" \ + "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void I444AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +#define ARGBTORGB565 \ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q2, q1, #5 \n" /* RG */ \ + "vsri.16 q2, q0, #11 \n" /* RGB */ + +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +#define ARGBTOARGB1555 \ + "vshll.u8 q3, d6, #8 \n" /* A */ \ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q3, q2, #1 \n" /* AR */ \ + "vsri.16 q3, q1, #6 \n" /* ARG */ \ + "vsri.16 q3, q0, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 + "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. 
+ "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d0, d0, #4 \n" /* B */ \ + "vbic.32 d2, d2, d7 \n" /* G */ \ + "vshr.u8 d4, d4, #4 \n" /* R */ \ + "vbic.32 d6, d6, d7 \n" /* A */ \ + "vorr d0, d0, d2 \n" /* BG */ \ + "vorr d1, d4, d6 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "vmov.u8 d7, #0x0f \n" // vbic bits to clear + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUV400 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23"); +} + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 byte Y's from tile and writes out 16 Y's. +// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes +// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes +// width measured in bytes so 8 UV = 16. +void DetileRow_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, #1792] \n" + "vst1.8 {q0}, [%1]! \n" // store 16 bytes + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "q0" // Clobber List + ); +} + +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. +void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, #3584] \n" + "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d1}, [%0], %4 \n" + "subs %3, %3, #16 \n" + "pld [%0, #1792] \n" + "vst1.8 {d0}, [%1]! \n" + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "d0", "d1" // Clobber List + ); +} + +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "pld [%0, #1792] \n" + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "pld [%1, #1792] \n" + "subs %3, %3, #16 \n" + "vst2.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUYV. 
+void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "subs %3, %3, #16 \n" + "pld [%0, #1792] \n" + "vzip.8 q0, q1 \n" + "pld [%1, #1792] \n" + "vst1.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list + ); +} +#endif + +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "vld1.8 {q14}, [%0]! \n" // Load lower bits. + "vld1.8 {q9}, [%0]! \n" // Load upper bits row + // by row. + "vld1.8 {q11}, [%0]! \n" + "vld1.8 {q13}, [%0]! \n" + "vld1.8 {q15}, [%0]! \n" + "vshl.u8 q8, q14, #6 \n" // Shift lower bit data + // appropriately. + "vshl.u8 q10, q14, #4 \n" + "vshl.u8 q12, q14, #2 \n" + "vzip.u8 q8, q9 \n" // Interleave upper and + // lower bits. + "vzip.u8 q10, q11 \n" + "vzip.u8 q12, q13 \n" + "vzip.u8 q14, q15 \n" + "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits + // into lower 6 bits for + // better accuracy in + // conversions. + "vsri.u16 q9, q9, #10 \n" + "vsri.u16 q10, q10, #10 \n" + "vsri.u16 q11, q11, #10 \n" + "vsri.u16 q12, q12, #10 \n" + "vsri.u16 q13, q13, #10 \n" + "vsri.u16 q14, q14, #10 \n" + "vsri.u16 q15, q15, #10 \n" + "vstmia %1!, {q8-q15} \n" // Store pixel block (64 + // pixels). + "subs %2, %2, #80 \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! 
\n" // next 8 RGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. +void SplitARGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %5, %5, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "vst1.8 {q3}, [%4]! \n" // store A + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! \n" // load B + "vld1.8 {q3}, [%3]! \n" // load A + "subs %5, %5, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. +void SplitXRGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeXRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 q3, #255 \n" // load A(255) + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "vmov.u32 q14, #1023 \n" + "vdup.32 q15, %5 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! 
\n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vshl.u32 q2, q2, q15 \n" // 000B + "vshl.u32 q1, q1, q15 \n" + "vshl.u32 q0, q0, q15 \n" + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "vmov.u32 q14, #1023 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // 000B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + "3: \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q14"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vdup.u16 q15, %6 \n" + "vdup.u16 q14, %7 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vmin.u16 q3, q3, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "subs %5, %5, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%4]! \n" + "vst4.16 {d1, d3, d5, d7}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vmov.u8 q3, #0xff \n" // A (0xffff) + "vdup.u16 q15, %5 \n" + "vdup.u16 q14, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "subs %4, %4, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%3]! \n" + "vst4.16 {d1, d3, d5, d7}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "vqmovn.u16 d2, q2 \n" + "vqmovn.u16 d3, q3 \n" + "subs %5, %5, #8 \n" + "vst4.8 {d0, d1, d2, d3}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %5 \n" + "vmov.u8 d6, #0xff \n" // A (0xff) + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vqmovn.u16 d5, q2 \n" + "vqmovn.u16 d4, q1 \n" + "vqmovn.u16 d3, q0 \n" + "subs %4, %4, #8 \n" + "vst4.u8 {d3, d4, d5, d6}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); +} + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0"); +} + +// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %2 \n" + "sub %0, %0, #32 \n" // 32 bytes per loop + + "1: \n" + "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 + "subs %2, #32 \n" // 32 pixels per loop. + "vrev64.8 q0, q2 \n" + "vrev64.8 q1, q1 \n" + "vswp d0, d1 \n" + "vswp d2, d3 \n" + "vst1.8 {q0, q1}, [%1]! 
\n" // dst += 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(-32) // %3 + : "cc", "memory", "q0", "q1", "q2"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %2, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r12", "q0"); +} + +void MirrorSplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "add %0, %0, %2, lsl #2 \n" + "sub %0, #32 \n" + + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vrev64.8 d3, d3 \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(-32) // %3 + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + src_rgb24 += width * 3 - 24; + asm volatile( + "1: \n" + "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"(-24) // %3 + : "cc", "memory", "d0", "d1", "d2"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + asm volatile( + "vmov.u8 d0, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d0, d1, d2, d3}, [%1]! 
\n" // store 8 pixels of RGBA. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgba), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst3.8 {d0, d2, d4}, [%1]! \n" // store 16 RGB24 pixels. + "vst3.8 {d1, d3, d5}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_uyvy + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 q4, q1, q3 \n" // average rows of UV + "vst1.8 {q4}, [%2]! \n" // store 8 UV. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "d6"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + asm volatile( + "vdup.32 d7, %2 \n" // dither4 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d7 \n" + "vqadd.u8 d2, d2, d7 \n" + "vqadd.u8 d4, d4, d7 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width) { + asm volatile( + "vmov.u8 d7, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// 8x1 pixels. 
+void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 + // coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + + "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned + "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned + + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", + "q15"); +} + +// clang-format off +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \ + "vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */ +// clang-format on + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match Intel code. 
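+// A scalar sketch of the 2x2-subsampled path implemented by ARGBToUVRow_NEON
+// above (illustrative only; the name is made up, and rounding differs
+// slightly from the NEON code, which halves the coefficients and feeds them
+// 2x-scaled averages):
+static void ARGBToUVRow_sketch(const uint8_t* src_argb, int src_stride_argb,
+                               uint8_t* dst_u, uint8_t* dst_v, int width) {
+  int x;
+  for (x = 0; x < width; x += 2) {
+    const uint8_t* p = src_argb + x * 4;     // memory order is B,G,R,A.
+    const uint8_t* q = p + src_stride_argb;  // second row of the 2x2 block.
+    int b = (p[0] + p[4] + q[0] + q[4] + 2) >> 2;  // average B of 4 pixels.
+    int g = (p[1] + p[5] + q[1] + q[5] + 2) >> 2;  // average G.
+    int r = (p[2] + p[6] + q[2] + q[6] + 2) >> 2;  // average R.
+    *dst_u++ = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
+    *dst_v++ = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
+  }
+}
+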
+void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. 
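+// The J variants use full-range JPEG constants instead of the BT.601 ones:
+// 127/84/43 replace 112/74/38 for U, and 127/107/20 replace 112/94/18 for V,
+// as loaded above. A scalar sketch of one UJ/VJ pair, assuming b, g and r
+// already hold the 2x2 averages (illustrative helpers, not part of libyuv):
+static inline uint8_t ToUJ_sketch(int b, int g, int r) {
+  return (uint8_t)((127 * b - 84 * g - 43 * r + 0x8080) >> 8);
+}
+static inline uint8_t ToVJ_sketch(int b, int g, int r) {
+  return (uint8_t)((127 * r - 107 * g - 20 * b + 0x8080) >> 8);
+}
+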
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. +void RAWToUVJRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q3, q2, q1) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. 
+ "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q0, q4, #1 \n" // 2x average + "vrshr.u16 q1, q5, #1 \n" + "vrshr.u16 q2, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! \n" + "vmov.u8 q1, q0 \n" + "vmov.u8 q3, q2 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "vld1.8 {q4}, [%3] \n" // shuffler + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! \n" + "vtbl.8 d2, {d0, d1}, d8 \n" + "vtbl.8 d3, {d0, d1}, d9 \n" + "vtbl.8 d6, {d4, d5}, d8 \n" + "vtbl.8 d7, {d4, d5}, d9 \n" + "vmov.u8 q0, q1 \n" + "vmov.u8 q2, q3 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToABGR) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vshrn.u16 d0, q0, #8 \n" + "vshrn.u16 d1, q1, #8 \n" + "vshrn.u16 d4, q2, #8 \n" + "vshrn.u16 d5, q3, #8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "vld1.8 {d8}, [%3] \n" // shuffler + + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vtbl.8 d0, {d0, d1}, d8 \n" + "vtbl.8 d1, {d2, d3}, d8 \n" + "vtbl.8 d4, {d4, d5}, d8 \n" + "vtbl.8 d5, {d6, d7}, d8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! 
\n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAB64ToARGB) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d0, d20 \n" // B + "vmull.u8 q9, d1, d20 \n" + "vmlal.u8 q8, d2, d21 \n" // G + "vmlal.u8 q9, d3, d21 \n" + "vmlal.u8 q8, d4, d22 \n" // R + "vmlal.u8 q9, d5, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d2, d20 \n" // B + "vmull.u8 q9, d3, d20 \n" + "vmlal.u8 q8, d4, d21 \n" // G + "vmlal.u8 q9, d5, d21 \n" + "vmlal.u8 q8, d6, d22 \n" // R + "vmlal.u8 q9, d7, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" + "1: \n" + "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of + // RGB24. + "vld3.8 {d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d2, d20 \n" // B + "vmull.u8 q9, d3, d20 \n" + "vmlal.u8 q8, d4, d21 \n" // G + "vmlal.u8 q9, d5, d21 \n" + "vmlal.u8 q8, d6, d22 \n" // R + "vmlal.u8 q9, d7, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); +} + +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! 
\n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); +} + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_16_NEON(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.16 d17, %4 \n" + "vdup.16 d16, %5 \n" + // General purpose row blend. + "1: \n" + "vld1.16 {q0}, [%1]! \n" + "vld1.16 {q1}, [%2]! \n" + "subs %3, %3, #8 \n" + "vmull.u16 q2, d0, d16 \n" + "vmull.u16 q3, d1, d16 \n" + "vmlal.u16 q2, d2, d17 \n" + "vmlal.u16 q3, d3, d17 \n" + "vrshrn.u32 d0, q2, #8 \n" + "vrshrn.u32 d1, q3, #8 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.16 {q0}, [%1]! \n" + "vld1.16 {q1}, [%2]! \n" + "subs %3, %3, #8 \n" + "vrhadd.u16 q0, q1 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.16 {q0}, [%1]! \n" + "subs %3, %3, #8 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8"); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); +} + +// Attenuate 8 pixels at a time. 
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. 
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8     d24, #29                      \n"  // B * 0.1140 coefficient
+      "vmov.u8     d25, #150                     \n"  // G * 0.5870 coefficient
+      "vmov.u8     d26, #77                      \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlal.u8    q2, d1, d25                   \n"  // G
+      "vmlal.u8    q2, d2, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit B
+      "vmov        d1, d0                        \n"  // G
+      "vmov        d2, d0                        \n"  // R
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8     d20, #17                      \n"  // BB coefficient
+      "vmov.u8     d21, #68                      \n"  // BG coefficient
+      "vmov.u8     d22, #35                      \n"  // BR coefficient
+      "vmov.u8     d24, #22                      \n"  // GB coefficient
+      "vmov.u8     d25, #88                      \n"  // GG coefficient
+      "vmov.u8     d26, #45                      \n"  // GR coefficient
+      "vmov.u8     d28, #24                      \n"  // RB coefficient
+      "vmov.u8     d29, #98                      \n"  // RG coefficient
+      "vmov.u8     d30, #50                      \n"  // RR coefficient
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]        \n"  // load 8 ARGB pixels.
+      "subs        %1, %1, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d20                   \n"  // B to Sepia B
+      "vmlal.u8    q2, d1, d21                   \n"  // G
+      "vmlal.u8    q2, d2, d22                   \n"  // R
+      "vmull.u8    q3, d0, d24                   \n"  // B to Sepia G
+      "vmlal.u8    q3, d1, d25                   \n"  // G
+      "vmlal.u8    q3, d2, d26                   \n"  // R
+      "vmull.u8    q8, d0, d28                   \n"  // B to Sepia R
+      "vmlal.u8    q8, d1, d29                   \n"  // G
+      "vmlal.u8    q8, d2, d30                   \n"  // R
+      "vqshrn.u16  d0, q2, #7                    \n"  // 16 bit to 8 bit B
+      "vqshrn.u16  d1, q3, #7                    \n"  // 16 bit to 8 bit G
+      "vqshrn.u16  d2, q8, #7                    \n"  // 16 bit to 8 bit R
+      "vst4.8      {d0, d1, d2, d3}, [%0]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(dst_argb),  // %0
+        "+r"(width)      // %1
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+        "q14", "q15");
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width) {
+  asm volatile(
+      "vld1.8      {q2}, [%3]                    \n"  // load 3 ARGB vectors.
+      "vmovl.s8    q0, d4                        \n"  // B,G coefficients s16.
+      "vmovl.s8    q1, d5                        \n"  // R,A coefficients s16.
+
+      "1:                                        \n"
+      "vld4.8      {d16, d18, d20, d22}, [%0]!   \n"  // load 8 ARGB pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmovl.u8    q8, d16                       \n"  // b (0 ..
255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// %y passes a float as a scalar vector for vector * scalar multiply. +// the regoster must be d0 to d15 and indexed with [0] or [1] to access +// the float in the first or second float of the d-reg + +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
+void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "vmov.u16 d6, #4 \n" // constant 4 + "vmov.u16 d7, #6 \n" // constant 6 + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows + "vld1.16 {q2}, [%4]! \n" + "vaddl.u16 q0, d2, d4 \n" // * 1 + "vaddl.u16 q1, d3, d5 \n" // * 1 + "vld1.16 {q2}, [%1]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "vld1.16 {q2}, [%2]! \n" + "vmlal.u16 q0, d4, d7 \n" // * 6 + "vmlal.u16 q1, d5, d7 \n" // * 6 + "vld1.16 {q2}, [%3]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "subs %6, %6, #8 \n" // 8 processed per loop + "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples + "bgt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "vmov.u32 q10, #4 \n" // constant 4 + "vmov.u32 q11, #6 \n" // constant 6 + + "1: \n" + "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples + "vld1.32 {q2}, [%0] \n" + "vadd.u32 q0, q0, q1 \n" // * 1 + "vadd.u32 q1, q1, q2 \n" // * 1 + "vld1.32 {q2, q3}, [%2]! \n" + "vmla.u32 q0, q2, q11 \n" // * 6 + "vmla.u32 q1, q3, q11 \n" // * 6 + "vld1.32 {q2, q3}, [%1]! \n" + "vld1.32 {q8, q9}, [%3]! \n" + "vadd.u32 q2, q2, q8 \n" // add rows for * 4 + "vadd.u32 q3, q3, q9 \n" + "vmla.u32 q0, q2, q10 \n" // * 4 + "vmla.u32 q1, q3, q10 \n" // * 4 + "subs %5, %5, #8 \n" // 8 processed per loop + "vqshrn.u32 d0, q0, #8 \n" // round and pack + "vqshrn.u32 d1, q1, #8 \n" + "vst1.u16 {q0}, [%4]! \n" // store 8 samples + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} + +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
+ "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d0, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +} + +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d1, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +} + +// Copy row of AYUV Y's into Y. +// Similar to ARGBExtractAlphaRow_NEON +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void HalfMergeUVRow_NEON(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + const uint8_t* src_u_1 = src_u + src_stride_u; + const uint8_t* src_v_1 = src_v + src_stride_v; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 U values + "vld1.8 {q1}, [%2]! \n" // load 16 V values + "vld1.8 {q2}, [%1]! \n" + "vld1.8 {q3}, [%3]! \n" + "vpaddl.u8 q0, q0 \n" // half size + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q1, q3 \n" + "vqrshrn.u16 d0, q0, #2 \n" + "vqrshrn.u16 d1, q1, #2 \n" + "subs %5, %5, #16 \n" // 16 src pixels per loop + "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_u_1), // %1 + "+r"(src_v), // %2 + "+r"(src_v_1), // %3 + "+r"(dst_uv), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + int shift = depth - 16; // Negative for right shift. + asm volatile( + "vdup.16 q2, %4 \n" + "1: \n" + "vld2.16 {q0, q1}, [%0]! 
\n" // load 8 UV + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst1.16 {q0}, [%1]! \n" // store 8 U pixels + "vst1.16 {q1}, [%2]! \n" // store 8 V pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "q0", "q1", "q2"); +} + +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "vdup.16 q2, %4 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" // load 8 U + "vld1.16 {q1}, [%1]! \n" // load 8 V + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "q0", "q1", "q2"); +} + +void MultiplyRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 q2, %3 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vmul.u16 q0, q0, q2 \n" + "vmul.u16 q1, q1, q2 \n" + "vst1.16 {q0}, [%1]! \n" + "vst1.16 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "cc", "memory", "q0", "q1", "q2"); +} + +void DivideRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 d8, %3 \n" + "1: \n" + "vld1.16 {q2, q3}, [%0]! \n" + "vmull.u16 q0, d4, d8 \n" + "vmull.u16 q1, d5, d8 \n" + "vmull.u16 q2, d6, d8 \n" + "vmull.u16 q3, d7, d8 \n" + "vshrn.u32 d0, q0, #16 \n" + "vshrn.u32 d1, q1, #16 \n" + "vshrn.u32 d2, q2, #16 \n" + "vshrn.u32 d3, q3, #16 \n" + "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels + "subs %2, %2, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "d8"); +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits = shr 1 +// 16384 = 10 bits = shr 2 +// 4096 = 12 bits = shr 4 +// 256 = 16 bits = shr 8 +void Convert16To8Row_NEON(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + asm volatile( + "vdup.16 q2, %3 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vshl.u16 q0, q0, q2 \n" // shr = q2 is negative + "vshl.u16 q1, q1, q2 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(shift) // %3 + : "cc", "memory", "q0", "q1", "q2"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_neon64.cc b/source/row_neon64.cc new file mode 100644 index 00000000..74190d61 --- /dev/null +++ b/source/row_neon64.cc @@ -0,0 +1,4509 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer +// STn over ZIP1+ST1 +// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions. + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// v0.8h: Y +// v1.16b: 8U, 8V + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.s}[0], [%[src_u]], #4 \n" \ + "ld1 {v1.s}[1], [%[src_v]], #4 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v1.16b, v1.16b, v1.16b \n" \ + "prfm pldl1keep, [%[src_u], 128] \n" \ + "prfm pldl1keep, [%[src_v], 128] \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.d}[0], [%[src_u]], #8 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "ld1 {v1.d}[1], [%[src_v]], #8 \n" \ + "prfm pldl1keep, [%[src_u], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_v], 448] \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "ldr d0, [%[src_y]], #8 \n" \ + "movi v1.16b, #128 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" + +static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, + 1, 1, 3, 3, 5, 5, 7, 7}; +static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, + 0, 0, 2, 2, 4, 4, 6, 6}; + +// Read 8 Y and 4 UV from NV12 or NV21 +#define READNV12 \ + "ldr d0, [%[src_y]], #8 \n" \ + "ldr d1, [%[src_uv]], #8 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" \ + "prfm pldl1keep, [%[src_uv], 448] \n" + +// Read 8 YUY2 +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_yuy2], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" + +// Read 8 UYVY +#define READUYVY \ + "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \ + "zip1 v0.16b, v4.16b, v4.16b \n" \ + "prfm pldl1keep, [%[src_uyvy], 448] \n" \ + "tbl v1.16b, {v3.16b}, v2.16b \n" + +// UB VR UG VG +// YG BB BG BR +#define YUVTORGB_SETUP \ + "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \ + "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n" + +// v16.8h: B +// v17.8h: G +// v18.8h: R + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "umull2 v3.4s, v0.8h, v24.8h \n" \ + "umull v6.8h, v1.8b, v30.8b \n" \ + "umull v0.4s, v0.4h, v24.4h \n" \ + "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ + "uqshrn v0.4h, v0.4s, #16 \n" \ + "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ + "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ + "add v17.8h, v0.8h, v26.8h \n" /* G */ \ + "add v16.8h, v0.8h, v4.8h \n" /* B */ \ + "add v18.8h, v0.8h, v5.8h \n" /* R */ \ + "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ + "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ + "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "uqshrn v17.8b, v17.8h, #6 \n" \ + "uqshrn v16.8b, v16.8h, #6 \n" \ + "uqshrn v18.8b, v18.8h, #6 \n" + +#define YUVTORGB_REGS \ + "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \ + "v26", "v27", "v28", "v29", "v30", "v31" + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + 
const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" /* A */ + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +void I444AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), 
// %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v15.8b, #255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v15"); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +#define ARGBTORGB565 \ + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v18.8h, v17.8h, #5 \n" /* RG */ \ + "sri v18.8h, v16.8h, #11 \n" /* RGB */ + +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + +#define ARGBTOARGB1555 \ + "shll v0.8h, v19.8b, #8 \n" /* A */ \ + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v0.8h, v18.8h, #1 \n" /* AR */ \ + "sri v0.8h, v17.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v16.8h, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels + // RGB565. 
+ "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +#define ARGBTOARGB4444 \ + /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \ + "ushr v16.8b, v16.8b, #4 \n" /* B */ \ + "bic v17.8b, v17.8b, v23.8b \n" /* G */ \ + "ushr v18.8b, v18.8b, #4 \n" /* R */ \ + "bic v19.8b, v19.8b, v23.8b \n" /* A */ \ + "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \ + "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v23.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" ARGBTOARGB4444 + "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 + // pixels + // ARGB4444. + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19", "v23"); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV400 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +#if LIBYUV_USE_ST4 +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v23.8b, #255 \n" + "1: \n" + "ld1 {v20.8b}, [%0], #8 \n" + "prfm pldl1keep, [%0, 448] \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23"); +} +#else +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #255 \n" + "1: \n" + "ldr d16, [%0], #8 \n" + "subs %w2, %w2, #8 \n" + "zip1 v18.16b, v16.16b, v16.16b \n" // YY + "zip1 v19.16b, v16.16b, v20.16b \n" // YA + "prfm pldl1keep, [%0, 448] \n" + "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA + "zip2 v17.16b, v18.16b, v19.16b \n" + "stp q16, q17, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v16", "v17", "v18", "v19", "v20"); +} +#endif // LIBYUV_USE_ST4 + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm 
volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 + // pixels + // RGB565. 
+ "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Reads 16 byte Y's from tile and writes out 16 Y's. +// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes +// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes +// width measured in bytes so 8 UV = 16. +void DetileRow_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead + "st1 {v0.16b}, [%1], #16 \n" // store 16 bytes + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "v0" // Clobber List + ); +} + +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. 
+void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead + "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b,v1.8b}, [%0], %4 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "st1 {v0.8b}, [%1], #8 \n" + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "prfm pldl1keep, [%0, 1792] \n" + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "prfm pldl1keep, [%1, 1792] \n" + "subs %w3, %w3, #16 \n" // store 8 YUY2 + "st2 {v0.16b,v1.16b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "zip1 v2.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 1792] \n" + "zip2 v3.16b, v0.16b, v1.16b \n" + "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2 + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list + ); +} +#endif + +// Unpack MT2T into tiled P010 64 pixels at a time. See +// tinyurl.com/mtk-10bit-video-format for format documentation. 
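+// A scalar sketch of the unpack, as read off the assembly below (an
+// illustrative reference only, not part of this patch or the build): every
+// 80-byte block holds 16 bytes of packed 2-bit low bits followed by 64 high
+// bytes; the 10-bit value lands in the top of each 16-bit output word with
+// its top bits replicated into the low bits. Assumes size is a multiple of
+// 80 bytes.
+#if 0
+static void UnpackMT2TSketch_C(const uint8_t* src, uint16_t* dst,
+                               size_t size) {
+  while (size >= 80) {
+    const uint8_t* lo = src;       // 16 bytes of 2-bit low bits, 4 per byte
+    const uint8_t* hi = src + 16;  // 64 bytes of high 8 bits
+    int p;
+    for (p = 0; p < 64; ++p) {
+      uint16_t low2 = (lo[p & 15] >> ((p >> 4) * 2)) & 3;
+      uint16_t v = (uint16_t)((hi[p] << 8) | (low2 << 6));
+      dst[p] = v | (v >> 10);  // replicate the top 6 bits into bits 0..5
+    }
+    src += 80;
+    dst += 64;
+    size -= 80;
+  }
+}
+#endif
+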
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "ld1 {v7.16b}, [%0], #16 \n" + "ld1 {v0.16b-v3.16b}, [%0], #64 \n" + "shl v4.16b, v7.16b, #6 \n" + "shl v5.16b, v7.16b, #4 \n" + "shl v6.16b, v7.16b, #2 \n" + "subs %2, %2, #80 \n" + "zip1 v16.16b, v4.16b, v0.16b \n" + "zip1 v18.16b, v5.16b, v1.16b \n" + "zip1 v20.16b, v6.16b, v2.16b \n" + "zip1 v22.16b, v7.16b, v3.16b \n" + "zip2 v17.16b, v4.16b, v0.16b \n" + "zip2 v19.16b, v5.16b, v1.16b \n" + "zip2 v21.16b, v6.16b, v2.16b \n" + "zip2 v23.16b, v7.16b, v3.16b \n" + "sri v16.8h, v16.8h, #10 \n" + "sri v17.8h, v17.8h, #10 \n" + "sri v18.8h, v18.8h, #10 \n" + "sri v19.8h, v19.8h, #10 \n" + "st1 {v16.8h-v19.8h}, [%1], #64 \n" + "sri v20.8h, v20.8h, #10 \n" + "sri v21.8h, v21.8h, #10 \n" + "sri v22.8h, v22.8h, #10 \n" + "sri v23.8h, v23.8h, #10 \n" + "st1 {v20.8h-v23.8h}, [%1], #64 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); +} + +#if LIBYUV_USE_ST2 +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "dup v2.8h, %w4 \n" + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // load 8 U + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ld1 {v1.8h}, [%1], #16 \n" // load 8 V + "ushl v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v2.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2"); +} +#else +// Reads 16 U's and V's and writes out 16 pairs of UV. 
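+// Either path (st2 above, zip1+st1 below) produces the same byte
+// interleave; a scalar sketch (an illustrative reference only, not part of
+// this patch or the build):
+#if 0
+static void MergeUVSketch_C(const uint8_t* src_u, const uint8_t* src_v,
+                            uint8_t* dst_uv, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_uv[2 * x + 0] = src_u[x];  // U in even bytes
+    dst_uv[2 * x + 1] = src_v[x];  // V in odd bytes
+  }
+}
+#endif
+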
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "zip1 v2.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip2 v3.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "dup v4.8h, %w4 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "ushl v0.8h, v0.8h, v4.8h \n"
+ "ushl v1.8h, v1.8h, v4.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v2.8h, v0.8h, v1.8h \n"
+ "zip2 v3.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+#endif // LIBYUV_USE_ST2
+
+// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
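+// ARGB is stored B, G, R, A in memory, which is why the ld4 lanes below map
+// v0 to B and v3 to A. A scalar sketch of the split (an illustrative
+// reference only, not part of this patch or the build):
+#if 0
+static void SplitARGBSketch_C(const uint8_t* src_argb, uint8_t* dst_r,
+                              uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a,
+                              int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_b[x] = src_argb[4 * x + 0];  // B is the lowest byte
+    dst_g[x] = src_argb[4 * x + 1];
+    dst_r[x] = src_argb[4 * x + 2];
+    dst_a[x] = src_argb[4 * x + 3];  // A is the highest byte
+  }
+}
+#endif
+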
+void SplitARGBRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%3], #16 \n" // store B + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%1], #16 \n" // store R + "st1 {v3.16b}, [%4], #16 \n" // store A + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +#if LIBYUV_USE_ST4 +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%2], #16 \n" // load B + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%0], #16 \n" // load R + "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "prfm pldl1keep, [%3, 448] \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#else +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%2], #16 \n" // load B + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%0], #16 \n" // load R + "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%2, 448] \n" + "zip1 v4.16b, v0.16b, v1.16b \n" // BG + "zip1 v5.16b, v2.16b, v3.16b \n" // RA + "prfm pldl1keep, [%1, 448] \n" + "zip2 v6.16b, v0.16b, v1.16b \n" // BG + "zip2 v7.16b, v2.16b, v3.16b \n" // RA + "prfm pldl1keep, [%0, 448] \n" + "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA + "zip2 v1.8h, v4.8h, v5.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "zip1 v2.8h, v6.8h, v7.8h \n" + "zip2 v3.8h, v6.8h, v7.8h \n" + "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} +#endif // LIBYUV_USE_ST4 + +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. 
+void SplitXRGBRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%3], #16 \n" // store B + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%1], #16 \n" // store R + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time +void MergeXRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.16b, #255 \n" // load A(255) + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v0.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "dup v31.4s, %w5 \n" + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "ushl v2.4s, v2.4s, v31.4s \n" // 000B + "ushl v1.4s, v1.4s, v31.4s \n" // G + "ushl v0.4s, v0.4s, v31.4s \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // 000B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "v0", "v1", "v2", "v30"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + 
uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "dup v30.8h, %w7 \n" + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umin v3.8h, v3.8h, v30.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "movi v3.16b, #0xff \n" // A (0xffff) + "dup v30.8h, %w6 \n" + "dup v31.8h, %w5 \n" + + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "uqxtn v3.8b, v3.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w5 \n" + "movi v3.8b, #0xff \n" // A (0xff) + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl 
v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +// Copy multiple of 32. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0"); +} + +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0"); +} + +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "ld1 {v3.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q2, [%0, 16] \n" + "ldr q1, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #32 \n" // 32 pixels per loop. + "tbl v0.16b, {v2.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirror) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Shuffle table for reversing the UV. +static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + asm volatile( + // Start at end of source row. + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorUV) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void MirrorSplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "ld1 {v4.16b}, [%4] \n" // shuffler + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w3, %w3, #16 \n" // 16 pixels per loop. 
+ "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "uzp1 v0.16b, v2.16b, v3.16b \n" // U + "uzp2 v1.16b, v2.16b, v3.16b \n" // V + "st1 {v0.16b}, [%1], #16 \n" // dst += 16 + "st1 {v1.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(&kShuffleMirrorUV) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +// Shuffle table for reversing the ARGB. +static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, + 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; + +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + // Start at end of source row. + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #8 \n" // 8 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "ld1 {v3.16b}, [%4] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #48 \n" + + "1: \n" + "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v0.16b, {v0.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "tbl v2.16b, {v2.16b}, v3.16b \n" + "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-48), // %3 + "r"(&kShuffleMirror) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of + // RGB24. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + asm volatile( + "movi v0.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v2.8b, v4.8b, v4.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" + "orr v1.8b, v5.8b, v5.8b \n" // move r + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgba), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 
+ "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b +// clobbers v3 +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "prfm pldl1keep, [%0, 448] \n" + "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 8 RGB24 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "prfm pldl1keep, [%0, 448] \n" + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
+ "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row + "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV + "prfm pldl1keep, [%0, 448] \n" + "st1 {v4.16b}, [%2], #16 \n" // store 8 UV. 
+ "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "subs %w4, %w4, #16 \n" // 16 pixels + "orr v2.8b, v1.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 + // pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v16", "v17", "v18", "v19"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + asm volatile( + "dup v1.4s, %w3 \n" // dither4 + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uqadd v16.8b, v16.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqadd v17.8b, v17.8b, v1.8b \n" + "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb), // %1 + "+r"(width) // %2 + : "r"(dither4) // %3 + : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width) { + asm volatile( + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 + // pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v16", "v17", "v18", "v19"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width) { + asm volatile( + "movi v23.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 + // pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); +} + +#if LIBYUV_USE_ST2 +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "mov v1.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "ldr q4, [%3] \n" // shuffler + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v1.16b, v0.16b \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToABGR) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} +#else +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "zip1 v2.16b, v0.16b, v0.16b \n" + "zip2 v3.16b, v0.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "zip1 v4.16b, v1.16b, v1.16b \n" + "zip2 v5.16b, v1.16b, v1.16b \n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} + +static const uvec8 kShuffleARGBToAB64[2] = { + {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7}, + {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}}; + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "ldp q6, q7, [%3] \n" // 2 shufflers + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64 + "tbl v3.16b, {v0.16b}, v7.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v4.16b, {v1.16b}, v6.16b \n" + "tbl v5.16b, {v1.16b}, v7.16b \n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToAB64[0]) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} +#endif // LIBYUV_USE_ST2 + +static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ldr q4, [%3] \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAR64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, + 21, 19, 17, 23, 29, 27, 25, 31}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ldr q4, [%3] \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAB64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movi v24.8b, #112 \n" // UB / VR 0.875 + // coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + + "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned + "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", + "v27", "v28", "v29"); +} + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// clang-format off +#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */ +// clang-format on + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +// TODO(fbarchard): consider ptrdiff_t for all strides. + +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +// TODO(fbarchard): Subsample match Intel code. +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
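+ // BGRA bytes are A,R,G,B in memory, so the alpha lane (v0) is ignored
+ // and B comes from the fourth lane (v3), as the remap above shows,
+ // before the shared RGBTOUV step.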
+ "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_bgra_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v2.8h, v1.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_rgba_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 
+ "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_rgb24_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_raw_1 = src_raw + src_stride_raw; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_raw_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. 
+ RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile( + RGBTOUV_SETUP_REG // sets v20-v25 + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28" + + ); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", + "v27"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "ldr d0, [%3] \n" // load rgbconstants + "dup v6.16b, v0.b[0] \n" + "dup v7.16b, v0.b[1] \n" + "dup v16.16b, v0.b[2] \n" + "dup v17.8h, v0.h[2] \n" + "1: \n" + "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 + // pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "umull v0.8h, v2.8b, v6.8b \n" // B + "umull2 v1.8h, v2.16b, v6.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "umlal v0.8h, v3.8b, v7.8b \n" // G + "umlal2 v1.8h, v3.16b, v7.16b \n" + "umlal v0.8h, v4.8b, v16.8b \n" // R + "umlal2 v1.8h, v4.16b, v16.16b \n" + "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v17.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "ldr d0, [%3] \n" // load rgbconstants + "dup v6.16b, v0.b[0] \n" + "dup v7.16b, v0.b[1] \n" + "dup v16.16b, v0.b[2] \n" + "dup v17.8h, v0.h[2] \n" + "1: \n" + "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16 + // pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. 
+ "umull v0.8h, v2.8b, v6.8b \n" // B + "umull2 v1.8h, v2.16b, v6.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "umlal v0.8h, v3.8b, v7.8b \n" // G + "umlal2 v1.8h, v3.16b, v7.16b \n" + "umlal v0.8h, v4.8b, v16.8b \n" // R + "umlal2 v1.8h, v4.16b, v16.16b \n" + "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v17.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "ldr d0, [%3] \n" // load rgbconstants + "dup v5.16b, v0.b[0] \n" + "dup v6.16b, v0.b[1] \n" + "dup v7.16b, v0.b[2] \n" + "dup v16.8h, v0.h[2] \n" + "1: \n" + "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "umull v0.8h, v2.8b, v5.8b \n" // B + "umull2 v1.8h, v2.16b, v5.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "umlal v0.8h, v3.8b, v6.8b \n" // G + "umlal2 v1.8h, v3.16b, v6.16b \n" + "umlal v0.8h, v4.8b, v7.8b \n" // R + "umlal2 v1.8h, v4.16b, v7.16b \n" + "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v16.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. 
+ "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); +} + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_16_NEON(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.8h, %w4 \n" + "dup v4.8h, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "umull v2.4s, v0.4h, v4.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.4s, v0.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.4s, v1.4h, v5.4h \n" + "umlal2 v3.4s, v1.8h, v5.8h \n" + "rshrn v0.4h, v2.4s, #8 \n" + "rshrn2 v0.8h, v3.4s, #8 \n" + "st1 {v0.8h}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.8h}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "st1 {v0.8h}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} + +// Bilinear filter 8x2 -> 8x1 +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + + asm volatile( + "dup v6.8h, %w6 \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.8h, %w4 \n" + "dup v4.8h, %w5 \n" + // General purpose row blend. 
+ "1: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "umull v2.4s, v0.4h, v4.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.4s, v0.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.4s, v1.4h, v5.4h \n" + "umlal2 v3.4s, v1.8h, v5.8h \n" + "rshrn v0.4h, v2.4s, #8 \n" + "rshrn2 v0.8h, v3.4s, #8 \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ldr q0, [%1], #16 \n" + "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative + "prfm pldl1keep, [%1, 448] \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "str d0, [%0], #8 \n" // store 8 pixels + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction), // %5 + "r"(shift) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" + + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel + // ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel + // ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. 
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18"); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "prfm pldl1keep, [%0, 448] \n" + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. 
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movi v24.8b, #29 \n" // B * 0.1140 coefficient + "movi v25.8b, #150 \n" // G * 0.5870 coefficient + "movi v26.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); +} + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 + +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "prfm pldl1keep, [%0, 448] \n" + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); +} + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. 
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "prfm pldl1keep, [%0, 448] \n" + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v22", "v23", "v24", "v25"); +} + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "prfm pldl1keep, [%0, 448] \n" + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "prfm pldl1keep, [%1, 448] \n" + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "prfm pldl1keep, [%0, 448] \n" + "orr v1.8b, v0.8b, v0.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "prfm pldl1keep, [%1, 448] \n" + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1"); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "prfm pldl1keep, [%1, 448] \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "prfm pldl1keep, [%2, 448] \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "add v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Caveat - rounds float to half float whereas scaling version truncates. 
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+  asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+  asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+  asm volatile(
+ "1: \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+  float fmax;
+  asm volatile(
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max accumulator
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fmax) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+  float fsum;
+  asm volatile(
+ "movi v5.4s, #0 \n" // sum
+ "movi v6.4s, #0 \n" // sum
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ : "+r"(src), // %0
+
"+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fsum) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fsum; +} + +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { + asm volatile( + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "prfm pldl1keep, [%0, 448] \n" + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%1, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "prfm pldl1keep, [%2, 448] \n" + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%3, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "prfm pldl1keep, [%4, 448] \n" + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : "r"(32LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f}; + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
+void GaussCol_F32_NEON(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width) { + asm volatile( + "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 + + "1: \n" + "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows + "ld1 {v2.4s, v3.4s}, [%1], #32 \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%2], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%3], #32 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "prfm pldl1keep, [%1, 448] \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%4], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%2, 448] \n" + "fadd v0.4s, v0.4s, v4.4s \n" // * 1 + "prfm pldl1keep, [%3, 448] \n" + "fadd v1.4s, v1.4s, v5.4s \n" + "prfm pldl1keep, [%4, 448] \n" + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : "r"(&kGaussCoefficients) // %7 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_F32_NEON(const float* src, float* dst, int width) { + asm volatile( + "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 + + "1: \n" + "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 + // rows + "fadd v0.4s, v0.4s, v1.4s \n" // * 1 + "ld1 {v4.4s, v5.4s}, [%0], %5 \n" + "fadd v1.4s, v1.4s, v2.4s \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%0], %4 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "ld1 {v4.4s, v5.4s}, [%0], %6 \n" + "fadd v2.4s, v2.4s, v4.4s \n" + "fadd v3.4s, v3.4s, v5.4s \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmul v0.4s, v0.4s, v8.4s \n" // / 256 + "fmul v1.4s, v1.4s, v8.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kGaussCoefficients), // %3 + "r"(8LL), // %4 + "r"(-4LL), // %5 + "r"(20LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); +} + +#if LIBYUV_USE_ST3 +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "prfm pldl1keep, [%0, 448] \n" + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} +#else +static const uvec8 kYUV24Shuffle[3] = { + {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20}, + {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27}, + {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}}; + +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. 
+// YUV24 is VUY in memory +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values + "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values + "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24 + "prfm pldl1keep, [%0, 448] \n" + "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n" + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} +#endif // LIBYUV_USE_ST3 + +// Note ST2 8b version is faster than zip+ST1 + +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Copy row of AYUV Y's into Y +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Shuffle table for swapping UV bytes. 
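+// tbl selects source bytes by index, so the pair-swapped indices below turn
+// U0 V0 U1 V1 ... into V0 U0 V1 U1 ...; a scalar equivalent (sketch):
+//   dst_vu[2 * i] = src_uv[2 * i + 1];
+//   dst_vu[2 * i + 1] = src_uv[2 * i];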
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; + +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values + "ld1 {v1.16b}, [%0], 16 \n" + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "tbl v0.16b, {v0.16b}, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v1.16b, {v1.16b}, v2.16b \n" + "stp q0, q1, [%1], 32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "r"(&kShuffleSwapUV) // %3 + : "cc", "memory", "v0", "v1", "v2"); +} + +void HalfMergeUVRow_NEON(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + const uint8_t* src_u_1 = src_u + src_stride_u; + const uint8_t* src_v_1 = src_v + src_stride_v; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values + "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values + "ld1 {v2.16b}, [%1], #16 \n" + "ld1 {v3.16b}, [%3], #16 \n" + "uaddlp v0.8h, v0.16b \n" // half size + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v3.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uqrshrn v0.8b, v0.8h, #2 \n" + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w5, %w5, #16 \n" // 16 src pixels per loop + "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_u_1), // %1 + "+r"(src_v), // %2 + "+r"(src_v_1), // %3 + "+r"(dst_uv), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + int shift = depth - 16; // Negative for right shift. 
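+  // ushl with a negative per-lane count is a logical right shift, so e.g.
+  // depth 10 gives shift -6 and each msb-aligned sample (P010 style) becomes
+  // an lsb-aligned 10-bit value.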
+ asm volatile( + "dup v2.8h, %w4 \n" + "1: \n" + "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ushl v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v2.8h \n" + "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels + "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2"); +} + +void MultiplyRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "dup v2.8h, %w3 \n" + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "mul v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "mul v1.8h, v1.8h, v2.8h \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels + "subs %w2, %w2, #16 \n" // 16 src pixels per loop + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "cc", "memory", "v0", "v1", "v2"); +} + +void DivideRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "dup v4.8h, %w3 \n" + "1: \n" + "ldp q2, q3, [%0], #32 \n" + "umull v0.4s, v2.4h, v4.4h \n" + "umull2 v1.4s, v2.8h, v4.8h \n" + "umull v2.4s, v3.4h, v4.4h \n" + "umull2 v3.4s, v3.8h, v4.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "shrn v0.4h, v0.4s, #16 \n" + "shrn2 v0.8h, v1.4s, #16 \n" + "shrn v1.4h, v2.4s, #16 \n" + "shrn2 v1.8h, v3.4s, #16 \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels + "subs %w2, %w2, #16 \n" // 16 src pixels per loop + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits = shr 1 +// 16384 = 10 bits = shr 2 +// 4096 = 12 bits = shr 4 +// 256 = 16 bits = shr 8 +void Convert16To8Row_NEON(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + asm volatile( + "dup v2.8h, %w3 \n" + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative + "ushl v1.8h, v1.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "uqxtn v0.8b, v0.8h \n" + "uqxtn2 v0.16b, v1.8h \n" + "subs %w2, %w2, #16 \n" // 16 src pixels per loop + "str q0, [%1], #16 \n" // store 16 pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(shift) // %3 + : "cc", "memory", "v0", "v1", "v2"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_rvv.cc b/source/row_rvv.cc new file mode 100644 index 00000000..27e91a3b --- /dev/null +++ b/source/row_rvv.cc @@ -0,0 +1,956 @@ +/* + * Copyright 2023 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. 
+ *
+ * Contributed by Darren Hsieh
+ * Contributed by Bruce Lai
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#include <assert.h>
+#include <riscv_vector.h>
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fill YUV -> RGB conversion constants into vectors
+// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+// register) is set to round-to-nearest-up mode (0).
+#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
+  { \
+    asm volatile("csrwi vxrm, 0"); \
+    ub = yuvconst->kUVCoeff[0]; \
+    vr = yuvconst->kUVCoeff[1]; \
+    ug = yuvconst->kUVCoeff[2]; \
+    vg = yuvconst->kUVCoeff[3]; \
+    yg = yuvconst->kRGBCoeffBias[0]; \
+    bb = yuvconst->kRGBCoeffBias[1] + 32; \
+    bg = yuvconst->kRGBCoeffBias[2] - 32; \
+    br = yuvconst->kRGBCoeffBias[3] + 32; \
+  }
+
+// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422
+#define READYUV422(vl, v_u, v_v, v_y_16) \
+  { \
+    vuint8m1_t v_tmp0, v_tmp1; \
+    vuint8m2_t v_y; \
+    vuint16m2_t v_u_16, v_v_16; \
+    vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+    v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \
+    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+    v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \
+    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+    vl = __riscv_vsetvl_e8m2(w); \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+  }
+
+// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444
+#define READYUV444(vl, v_u, v_v, v_y_16) \
+  { \
+    vuint8m2_t v_y; \
+    vl = __riscv_vsetvl_e8m2(w); \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+    v_u = __riscv_vle8_v_u8m2(src_u, vl); \
+    v_v = __riscv_vle8_v_u8m2(src_v, vl); \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+  }
+
+// Convert from YUV to fixed point RGB
+#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \
+                 v_b_16, v_r_16) \
+  { \
+    vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
+    vuint32m8_t v_tmp5; \
+    v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \
+    v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \
+    v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \
+    v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \
+    v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \
+    v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \
+    v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \
+    v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \
+    v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \
+    v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \
+    v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \
+    v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \
+  }
+
+// Convert from fixed point RGB To 8 bit RGB
+#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
+  { \
+    v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \
+    v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \
+    v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \
+  }
+
+void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
+  size_t avl = (size_t)4 * width;
+  do {
+    vuint16m8_t v_ar64;
+    vuint8m4_t v_argb;
+    size_t vl = __riscv_vsetvl_e8m4(avl);
+    v_argb = __riscv_vle8_v_u8m4(src_argb, vl);
+    v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl);
+    v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl);
+    __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl);
+    avl -=
vl; + src_argb += vl; + dst_ar64 += vl; + } while (avl > 0); +} + +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m1(avl); + __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} + +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e16m8(avl); + v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl); + v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl); + __riscv_vse8_v_u8m4(dst_argb, v_argb, vl); + avl -= vl; + src_ar64 += vl; + dst_argb += vl; + } while (avl > 0); +} + +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e16m2(avl); + __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); + v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} + +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + 
__riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); +} + +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV444(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, 
+ const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} + +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); + vuint8m2_t 
v_a = __riscv_vmv_v_x_u8m2(255u, vl); + vuint16m4_t v_yb; + vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) sets to round-to-nearest-up mode(0). + asm volatile("csrwi vxrm, 0"); + if (is_yb_positive) { + v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); + } else { + v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl); + } + do { + vuint8m2_t v_y, v_out; + vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; + vl = __riscv_vsetvl_e8m2(w); + v_y = __riscv_vle8_v_u8m2(src_y, vl); + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); + v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y + v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl); + if (is_yb_positive) { + v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); + } else { + v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); + } + v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y; + v_y = __riscv_vle8_v_u8m2(src_y, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl); + __riscv_vse8_v_u8m8(dst, v_data, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} + +// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + size_t dst_w = (size_t)dst_width; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + // Blend 100 / 0 - Copy row unchanged. + if (y1_fraction == 0) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); + dst_w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // Blend 50 / 50. + if (y1_fraction == 128) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); + vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); + // Averaging add + vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); + __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // General purpose row blend. + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). 
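+  // Per byte this computes (vnclipu with vxrm=0 supplies the +128 rounding):
+  //   dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.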
+ asm volatile("csrwi vxrm, 0"); + do { + size_t vl = __riscv_vsetvl_e8m4(dst_w); + vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); + vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); + acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); +} + +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} + +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} + +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r, v_g, v_b; + v_r = __riscv_vle8_v_u8m2(src_r, vl); + v_g = __riscv_vle8_v_u8m2(src_g, vl); + v_b = 
__riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_u, v_v; + __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} + +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t w = (size_t)width; + do { + vuint8m4_t v_u, v_v; + size_t vl = __riscv_vsetvl_e8m4(w); + v_u = __riscv_vle8_v_u8m4(src_u, vl); + v_v = __riscv_vle8_v_u8m4(src_v, vl); + __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. 
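+// Per pixel the helper below computes (scalar model):
+//   y = (r * kRGBToY[2] + g * kRGBToY[1] + b * kRGBToY[0] + kAddY) >> 8
+// e.g. with kRgb24I601Constants: y = (66 * r + 129 * g + 25 * b + 0x1080) >> 8.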
+void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
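+// The helper below differs from ARGBToYMatrixRow_RVV only in the vlseg4
+// destination order (alpha first). BGRAToYRow_RVV can reuse it with
+// kRawI601Constants because BGRA's memory order (A, R, G, B) loads R and B
+// into swapped lanes, and kRaw's swapped B/R coefficients compensate.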
+void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); +} + +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). 
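+  // Per channel this premultiplies by alpha with rounding:
+  //   b = (b * a + 128) >> 8 (likewise g and r; vnclipu #8 does the shift).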
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + v_b = __riscv_vnclipu_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnclipu_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnclipu_wx_u8m2(v_ra_16, 8, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) diff --git a/source/row_win.cc b/source/row_win.cc new file mode 100644 index 00000000..5fb28521 --- /dev/null +++ b/source/row_win.cc @@ -0,0 +1,6440 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +// This module is for Visual C 32/64 bit +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) + +#if defined(_M_ARM64EC) +#include +#elif defined(_M_X64) +#include +#include // For _mm_maddubs_epi16 +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// 64 bit +#if defined(_M_X64) + +// Read 8 UV from 444 +#define READYUV444 \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + u_buf += 8; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; + +// Read 8 UV from 444, With 8 Alpha. +#define READYUVA444 \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + u_buf += 8; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; + +// Convert 8 pixels: 8 UV and 8 Y. 
+#define YUVTORGB(yuvconstants) \ + xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ + xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ + xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ + xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ + xmm0 = _mm_adds_epi16(xmm4, xmm0); \ + xmm1 = _mm_subs_epi16(xmm4, xmm1); \ + xmm2 = _mm_adds_epi16(xmm4, xmm2); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); + +// Store 8 ARGB values. +#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ + dst_argb += 32; + +#if defined(HAS_I422TOARGBROW_SSSE3) +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + const __m128i xmm5 = _mm_set1_epi8(-1); + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUV422 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +#if defined(HAS_I444TOARGBROW_SSSE3) +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + const __m128i xmm5 = _mm_set1_epi8(-1); + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUV444 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUVA444 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +// 32 bit +#else // defined(_M_X64) +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB. +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; + +// JPeg full range. 
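Spelled out for one pixel (libyuv ARGB is stored B, G, R, A in memory, so each coefficient row lists the B weight first), these 7-bit fixed-point rows compute, modulo SIMD saturation details:

  Y601 = ((13*B + 65*G + 33*R) >> 7) + 16   // kARGBToY above: BT.601 studio swing
  YJ   = (15*B + 75*G + 38*R + 64) >> 7     // kARGBToYJ below: full swing, +64 rounds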
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; + +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; + +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; + +static const vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; + +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; + +// Constants for BGRA. +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; + +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; + +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; + +// Constants for ABGR. +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; + +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; + +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; + +// Constants for RGBA. +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; + +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; + +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; + +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; + +// 7 bit fixed point 0.5. +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; + +// 8 bit fixed point 0.5, for bias of UV. +static const ulvec8 kBiasUV128 = { + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; + +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Middle 8. +static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RAW. 
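These tables drive pshufb, whose per-byte behavior the following reference sketch models (a hand-written stand-in, not the intrinsic): each mask byte selects a source byte, and any mask byte with the high bit set writes zero. That is how kShuffleMaskARGBToRGB24 above packs 16 bytes of ARGB into 12 bytes of RGB24 plus four zeros; the RAW table below applies the same idea with R and B swapped.

#include <stdint.h>

static void Pshufb16Sketch(const uint8_t src[16], const uint8_t mask[16],
                           uint8_t dst[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (uint8_t)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
  }
}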
+static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; + +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; + +// Duplicates gray value 3 times and fills in alpha opaque. +__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_J400TOARGBROW_AVX2 +// Duplicates gray value 3 times and fills in alpha opaque. 
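Both the SSE2 row above and the AVX2 row below match this scalar reference loop (a sketch for clarity, not part of the patch):

#include <stdint.h>

static void J400ToARGBRowSketch(const uint8_t* src_y, uint8_t* dst_argb,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t y = src_y[x];
    dst_argb[0] = y;    // B
    dst_argb[1] = y;    // G
    dst_argb[2] = y;    // R
    dst_argb[3] = 255;  // A: the 0xff000000 mask built with pslld/vpslld 24
    dst_argb += 4;
  }
}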
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vpunpckhwd ymm1, ymm0, ymm0 + vpunpcklwd ymm0, ymm0, ymm0 + vpor ymm0, ymm0, ymm5 + vpor ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_J400TOARGBROW_AVX2 + +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 + mov ecx, [esp + 12] // width + movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 + movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 + movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 4] + movdqu xmm2, [eax + 8] + lea eax, [eax + 24] + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. 
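Concretely: with a 5-bit field shifted to the top of a 16-bit lane, pmulhuw by 0x0108 yields ((v5 << 11) * 264) >> 16 == (v5 << 3) | (v5 >> 2), the classic bit-replication expansion (31 -> 255, 16 -> 132). A scalar reference:

// Reference model of the 5-to-8-bit replication performed below.
static unsigned char Replicate5To8(unsigned v5) { /* v5 in [0, 31] */
  return (unsigned char)(((v5 << 11) * 0x0108u) >> 16);
}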
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
+ vpsllw ymm4, ymm4, 10
+ vpsrlw ymm4, ymm4, 5
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
+ vpand ymm1, ymm0, ymm3 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpand ymm0, ymm0, ymm4 // G in middle 6 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
+ vpor ymm0, ymm0, ymm7 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate
mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpand ymm1, ymm1, ymm3 + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpand ymm2, ymm2, ymm7 + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB1555TOARGBROW_AVX2 + +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles + vpsrlw ymm3, ymm2, 4 + vpsllw ymm1, ymm0, 4 + vpor ymm2, ymm2, ymm3 + vpor ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm2, ymm2, 0xd8 + vpunpckhbw ymm1, ymm0, ymm2 + vpunpcklbw ymm0, ymm0, ymm2 + vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB4444TOARGBROW_AVX2 + +// 24 instructions +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 
// store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // 
dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + __asm { + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // width + punpcklbw xmm6, xmm6 // make dither 16 bytes + movdqa xmm7, xmm6 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + vbroadcastss xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // width + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + vpermq ymm6, ymm6, 0xd8 + vpunpcklwd ymm6, ymm6, ymm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +// TODO(fbarchard): Improve sign extension/packing. 
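Per pixel, the RGB565 rows above reduce to this scalar packing (a reference sketch); the ARGB1555 and ARGB4444 rows that follow differ only in field widths plus the retained alpha bit(s):

static unsigned short ARGBToRGB565Pixel(const unsigned char* argb) {
  return (unsigned short)((argb[0] >> 3) |          // B: 5 bits
                          ((argb[1] >> 2) << 5) |   // G: 6 bits
                          ((argb[2] >> 3) << 11));  // R: 5 bits
}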
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrld xmm0, 4 + psrld xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565ROW_AVX2 +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565ROW_AVX2 + +#ifdef HAS_ARGBTOARGB1555ROW_AVX2 +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm4, ymm4, ymm4 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpslld ymm7, ymm7, 15 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, 
ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB1555ROW_AVX2 + +#ifdef HAS_ARGBTOARGB4444ROW_AVX2 +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpsllw ymm4, ymm4, 12 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble + vpsrld ymm1, ymm1, 8 + vpsrld ymm0, ymm0, 4 + vpor ymm0, ymm0, ymm1 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB4444ROW_AVX2 + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 // Add .5 for rounding. + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
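Why kPermdARGBToY_AVX is needed: vphaddw and vpackuswb operate within each 128-bit lane, so the 32 Y bytes emerge dword-scrambled; the {0, 4, 1, 5, 2, 6, 3, 7} dword permute restores memory order. Traced in groups of four Y values:

  after vphaddw + vpackuswb: Y0-3 Y8-11 Y16-19 Y24-27 | Y4-7  Y12-15 Y20-23 Y28-31
  after vpermd:              Y0-3 Y4-7  Y8-11  Y12-15 | Y16-19 Y20-23 Y24-27 Y28-31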
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToY + vbroadcastf128 ymm5, xmmword ptr kAddY16 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vpaddb ymm0, ymm0, ymm5 // add 16 for Y + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToYJ + vbroadcastf128 ymm5, xmmword ptr kAddYJ64 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. + vpaddw ymm2, ymm2, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
+ vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYJROW_AVX2 + +__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kBGRAToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kABGRToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kRGBAToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kBiasUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 
+ pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kBiasUV128 + movdqa xmm6, xmmword ptr kARGBToVJ + movdqa xmm7, xmmword ptr kARGBToUJ + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBTOUVROW_AVX2 +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToV + vbroadcastf128 ymm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, 
ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw + vpaddb ymm0, ymm0, ymm5 // -> unsigned + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_AVX2 +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToVJ + vbroadcastf128 ymm7, xmmword ptr kARGBToUJ + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned + vpaddw ymm0, ymm0, ymm5 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVJROW_AVX2 + +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm5, xmmword ptr kBiasUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* convert to U and V */ + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw 
xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqu [edx + edi], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kBiasUV128 + movdqa xmm6, xmmword ptr kBGRAToV + movdqa xmm7, xmmword ptr kBGRAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kBiasUV128 + movdqa xmm6, xmmword ptr kABGRToV + movdqa xmm7, xmmword ptr kABGRToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - 
store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kBiasUV128 + movdqa xmm6, xmmword ptr kRGBAToV + movdqa xmm7, xmmword ptr kRGBAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 16 UV from 444. With 16 Alpha. +#define READYUVA444_AVX2 \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vpermq ymm5, ymm5, 0xd8 \ + __asm lea ebp, [ebp + 16]} + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + __asm { \ + __asm vmovq xmm3, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
+#define READYUVA422_AVX2 \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vpermq ymm5, ymm5, 0xd8 \
+ __asm lea ebp, [ebp + 16]}
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 UV from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
+ __asm lea eax, [eax + 32]}
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
+ __asm lea eax, [eax + 32]}
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) \
+ __asm { \
+ __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
+ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
+ __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+ __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+ __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+ __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+ __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+ __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm vpaddw ymm4, ymm3, ymm4 \
+ __asm vpaddsw ymm0, ymm0, ymm4 \
+ __asm vpsubsw ymm1, ymm4, ymm1 \
+ __asm vpaddsw ymm2, ymm2, ymm4 \
+ __asm vpsraw ymm0, ymm0, 6 \
+ __asm vpsraw ymm1, ymm1, 6 \
+ __asm vpsraw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 \
+ __asm vpackuswb ymm1, ymm1, ymm1 \
+ __asm vpackuswb ymm2, ymm2, ymm2}
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vmovdqu 0[edx], ymm1 \
+ __asm vmovdqu 32[edx], ymm0 \
+ __asm lea edx, [edx + 64]}
+
+// Store 16 RGBA values.
+#define STORERGBA_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vmovdqu [edx], ymm0 \ + __asm vmovdqu [edx + 32], ymm1 \ + __asm lea edx, [edx + 64]} + +#ifdef HAS_I422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void I422ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOARGBROW_AVX2 + +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +__declspec(naked) void I422AlphaToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void I444ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + convertloop: + READYUV444_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I444TOARGBROW_AVX2 + +#ifdef HAS_I444ALPHATOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
+__declspec(naked) void I444AlphaToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  __asm {
+    push esi
+    push edi
+    push ebx
+    push ebp
+    mov eax, [esp + 16 + 4]  // Y
+    mov esi, [esp + 16 + 8]  // U
+    mov edi, [esp + 16 + 12]  // V
+    mov ebp, [esp + 16 + 16]  // A
+    mov edx, [esp + 16 + 20]  // argb
+    mov ebx, [esp + 16 + 24]  // yuvconstants
+    mov ecx, [esp + 16 + 28]  // width
+    sub edi, esi
+    convertloop:
+    READYUVA444_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub ecx, 16
+    jg convertloop
+
+    pop ebp
+    pop ebx
+    pop edi
+    pop esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I444ALPHATOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void NV12ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* uv_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  __asm {
+    push esi
+    push ebx
+    mov eax, [esp + 8 + 4]  // Y
+    mov esi, [esp + 8 + 8]  // UV
+    mov edx, [esp + 8 + 12]  // argb
+    mov ebx, [esp + 8 + 16]  // yuvconstants
+    mov ecx, [esp + 8 + 20]  // width
+    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
+
+    convertloop:
+    READNV12_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub ecx, 16
+    jg convertloop
+
+    pop ebx
+    pop esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void NV21ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* vu_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  __asm {
+    push esi
+    push ebx
+    mov eax, [esp + 8 + 4]  // Y
+    mov esi, [esp + 8 + 8]  // VU
+    mov edx, [esp + 8 + 12]  // argb
+    mov ebx, [esp + 8 + 16]  // yuvconstants
+    mov ecx, [esp + 8 + 20]  // width
+    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
+
+    convertloop:
+    READNV21_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub ecx, 16
+    jg convertloop
+
+    pop ebx
+    pop esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_YUY2TOARGBROW_AVX2
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+    const uint8_t* src_yuy2,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  __asm {
+    push ebx
+    mov eax, [esp + 4 + 4]  // yuy2
+    mov edx, [esp + 4 + 8]  // argb
+    mov ebx, [esp + 4 + 12]  // yuvconstants
+    mov ecx, [esp + 4 + 16]  // width
+    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
+
+    convertloop:
+    READYUY2_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub ecx, 16
+    jg convertloop
+
+    pop ebx
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_YUY2TOARGBROW_AVX2
+
+#ifdef HAS_UYVYTOARGBROW_AVX2
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
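+// (Byte order of the two packed 4:2:2 formats handled here: YUY2 is
+// {Y0,U0,Y1,V0} and UYVY is {U0,Y0,V0,Y1}, i.e. four bytes carry two pixels.
+// The kShuffleYUY2* / kShuffleUYVY* tables separate them; in scalar form,
+// for one 4-byte UYVY group -- sketch only:
+//   u = src[0]; y0 = src[1]; v = src[2]; y1 = src[3];)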
+__declspec(naked) void UYVYToARGBRow_AVX2( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READUYVY_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + vzeroupper + ret + } +} +#endif // HAS_UYVYTOARGBROW_AVX2 + +#ifdef HAS_I422TORGBAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +__declspec(naked) void I422ToRGBARow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // abgr + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(ebx) + STORERGBA_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TORGBAROW_AVX2 + +#if defined(HAS_I422TOARGBROW_SSSE3) +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. +// Allows a conversion with half size scaling. + +// Read 8 UV from 444. +#define READYUV444 \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* U */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 UV from 444. With 8 Alpha. +#define READYUVA444 \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* U */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 \ + __asm { \ + __asm movd xmm3, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + __asm { \ + __asm movd xmm3, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 VU from NV21, upsample to 8 UV. 
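+// (NV21 interleaves chroma as {V,U} bytes, so kShuffleNV21 swaps each pair
+// back to U,V order and duplicates it for the 2x horizontal upsample in one
+// pshufb; a scalar sketch of the effect, illustrative names only:
+//   u = vu[2 * i + 1]; v = vu[2 * i];  // then reused for two pixels)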
+#define READNV21 \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. +#define READYUY2 \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 16]} + +// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. +#define READUYVY \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 16]} + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(YuvConstants) \ + __asm { \ + __asm psubb xmm3, xmmword ptr kBiasUV128 \ + __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ + __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ + __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ + __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ + __asm pmaddubsw xmm0, xmm3 \ + __asm pmaddubsw xmm1, xmm3 \ + __asm pmaddubsw xmm2, xmm3 \ + __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ + __asm paddw xmm4, xmm3 \ + __asm paddsw xmm0, xmm4 \ + __asm paddsw xmm2, xmm4 \ + __asm psubsw xmm4, xmm1 \ + __asm movdqa xmm1, xmm4 \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Store 8 ARGB values. +#define STOREARGB \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm0 \ + __asm movdqu 16[edx], xmm1 \ + __asm lea edx, [edx + 32]} + +// Store 8 BGRA values. +#define STOREBGRA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32]} + +// Store 8 RGBA values. +#define STORERGBA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32]} + +// Store 8 RGB24 values. +#define STORERGB24 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. 
*/ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24]} + +// Store 8 RGB565 values. +#define STORERGB565 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ + __asm packssdw xmm0, xmm1 \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16]} + +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void I444ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV444 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes). +__declspec(naked) void I444AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA444 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
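+// (The RGB24 rows below rely on STORERGB24 above, which squeezes 4-byte
+// B,G,R,R pixels down to 3-byte B,G,R with two pshufbs and a palignr; the
+// scalar equivalent of the pack step -- sketch only:
+//   dst_rgb24[3 * i + 0] = b[i];
+//   dst_rgb24[3 * i + 1] = g[i];
+//   dst_rgb24[3 * i + 2] = r[i];)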
+__declspec(naked) void I422ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes). +__declspec(naked) void I444ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + READYUV444 + YUVTORGB(ebx) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). +__declspec(naked) void I422ToRGB565Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb565_buf, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + psrld xmm5, 27 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + psrld xmm6, 26 + pslld xmm6, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pslld xmm7, 11 + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGB565 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void I422ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. 
+__declspec(naked) void I422AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void NV12ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV12 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void NV21ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV21 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop esi + ret + } +} + +// 8 pixels. +// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). +__declspec(naked) void YUY2ToARGBRow_SSSE3( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUY2 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + ret + } +} + +// 8 pixels. +// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+    const uint8_t* src_uyvy,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  __asm {
+    push ebx
+    mov eax, [esp + 4 + 4]  // uyvy
+    mov edx, [esp + 4 + 8]  // argb
+    mov ebx, [esp + 4 + 12]  // yuvconstants
+    mov ecx, [esp + 4 + 16]  // width
+    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
+
+    convertloop:
+    READUYVY
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub ecx, 8
+    jg convertloop
+
+    pop ebx
+    ret
+  }
+}
+
+__declspec(naked) void I422ToRGBARow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_rgba,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  __asm {
+    push esi
+    push edi
+    push ebx
+    mov eax, [esp + 12 + 4]  // Y
+    mov esi, [esp + 12 + 8]  // U
+    mov edi, [esp + 12 + 12]  // V
+    mov edx, [esp + 12 + 16]  // argb
+    mov ebx, [esp + 12 + 20]  // yuvconstants
+    mov ecx, [esp + 12 + 24]  // width
+    sub edi, esi
+
+    convertloop:
+    READYUV422
+    YUVTORGB(ebx)
+    STORERGBA
+
+    sub ecx, 8
+    jg convertloop
+
+    pop ebx
+    pop edi
+    pop esi
+    ret
+  }
+}
+#endif  // HAS_I422TOARGBROW_SSSE3
+
+// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          const struct YuvConstants*,
+                                          int width) {
+  __asm {
+    mov eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
+    movd xmm2, eax
+    pshufd xmm2, xmm2, 0
+    mov eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
+    movd xmm3, eax
+    pshufd xmm3, xmm3, 0
+    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
+    pslld xmm4, 24
+
+    mov eax, [esp + 4]  // Y
+    mov edx, [esp + 8]  // rgb
+    mov ecx, [esp + 12]  // width
+
+    convertloop:
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    movq xmm0, qword ptr [eax]
+    lea eax, [eax + 8]
+    punpcklbw xmm0, xmm0  // Y.Y
+    pmulhuw xmm0, xmm2
+    psubusw xmm0, xmm3
+    psrlw xmm0, 6
+    packuswb xmm0, xmm0  // G
+
+    // Step 2: Weave into ARGB
+    punpcklbw xmm0, xmm0  // GG
+    movdqa xmm1, xmm0
+    punpcklwd xmm0, xmm0  // BGRA first 4 pixels
+    punpckhwd xmm1, xmm1  // BGRA next 4 pixels
+    por xmm0, xmm4
+    por xmm1, xmm4
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
+    lea edx, [edx + 32]
+    sub ecx, 8
+    jg convertloop
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          const struct YuvConstants*,
+                                          int width) {
+  __asm {
+    mov eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
+    vmovd xmm2, eax
+    vbroadcastss ymm2, xmm2
+    mov eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
+    vmovd xmm3, eax
+    vbroadcastss ymm3, xmm3
+    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0xff000000
+    vpslld ymm4, ymm4, 24
+
+    mov eax, [esp + 4]  // Y
+    mov edx, [esp + 8]  // rgb
+    mov ecx, [esp + 12]  // width
+
+    convertloop:
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu xmm0, [eax]
+    lea eax, [eax + 16]
+    vpermq ymm0, ymm0, 0xd8  // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
+    vpmulhuw ymm0, ymm0, ymm2
+    vpsubusw ymm0, ymm0, ymm3
+    vpsrlw ymm0, ymm0, 6
+    vpackuswb ymm0, ymm0, ymm0  // G.  still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
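+    // For reference, step 1 per pixel in scalar form (constants as loaded
+    // above; sketch only): each Y byte is widened to y * 257, then
+    //   g = sat_sub((y * 257 * 18997) >> 16, 1160) >> 6;
+    // which approximates (y - 16) * 1.164, saturated to 8 bits by the pack.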
+ // Step 2: Weave into ARGB + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + vpermq ymm1, ymm1, 0xd8 + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm4 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_I400TOARGBROW_AVX2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +// TODO(fbarchard): Replace lea with -16 offset. +__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, xmmword ptr kShuffleMirror + + convertloop: + movdqu xmm0, [eax - 16 + ecx] + pshufb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vbroadcastf128 ymm5, xmmword ptr kShuffleMirror + + convertloop: + vmovdqu ymm0, [eax - 32 + ecx] + vpshufb ymm0, ymm0, ymm5 + vpermq ymm0, ymm0, 0x4e // swap high and low halfs + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORSPLITUVROW_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + +__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, xmmword ptr kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm1 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORSPLITUVROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufd xmm0, xmm0, 0x1b + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. 
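+// (This table reverses whole 4-byte pixels rather than bytes: vpermd with
+// dword indices 7..0 makes each store write
+//   dst32[i] = src32[width - 1 - i];
+// eight ARGB pixels per iteration.)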
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 + + convertloop: + vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_SPLITUVROW_AVX2 +__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpackuswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vmovdqu [edx], ymm0 + vmovdqu [edx + edi], ymm2 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // read 16 U's + movdqu xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqu [edi], xmm0 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + convertloop: + vpmovzxbw ymm0, [eax] + vpmovzxbw ymm1, [eax + edx] + lea eax, [eax + 16] + vpsllw ymm1, ymm1, 8 + vpor ymm2, ymm1, ymm0 + vmovdqu [edi], ymm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_MERGEUVROW_AVX2 + 
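+// For reference, the plain scalar interleave that the MergeUV rows above
+// accelerate; a sketch in the spirit of the portable fallback (the shipped
+// C version lives in source/row_common.cc):
+static void MergeUVRow_Sketch(const uint8_t* src_u,
+                              const uint8_t* src_v,
+                              uint8_t* dst_uv,
+                              int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_uv[0] = src_u[x];  // U in the even byte,
+    dst_uv[1] = src_v[x];  // V in the odd byte, as NV12 expects.
+    dst_uv += 2;
+  }
+}
+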
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at a time.
+__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
+  __asm {
+    mov eax, [esp + 4]  // src
+    mov edx, [esp + 8]  // dst
+    mov ecx, [esp + 12]  // width
+    test eax, 15
+    jne convertloopu
+    test edx, 15
+    jne convertloopu
+
+    convertloopa:
+    movdqa xmm0, [eax]
+    movdqa xmm1, [eax + 16]
+    lea eax, [eax + 32]
+    movdqa [edx], xmm0
+    movdqa [edx + 16], xmm1
+    lea edx, [edx + 32]
+    sub ecx, 32
+    jg convertloopa
+    ret
+
+    convertloopu:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    lea eax, [eax + 32]
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
+    lea edx, [edx + 32]
+    sub ecx, 32
+    jg convertloopu
+    ret
+  }
+}
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'width' bytes using a 32 byte load/store, 64 bytes at a time.
+__declspec(naked) void CopyRow_AVX(const uint8_t* src,
+                                   uint8_t* dst,
+                                   int width) {
+  __asm {
+    mov eax, [esp + 4]  // src
+    mov edx, [esp + 8]  // dst
+    mov ecx, [esp + 12]  // width
+
+    convertloop:
+    vmovdqu ymm0, [eax]
+    vmovdqu ymm1, [eax + 32]
+    lea eax, [eax + 64]
+    vmovdqu [edx], ymm0
+    vmovdqu [edx + 32], ymm1
+    lea edx, [edx + 64]
+    sub ecx, 64
+    jg convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_COPYROW_AVX
+
+// Multiple of 1.
+__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
+  __asm {
+    mov eax, esi
+    mov edx, edi
+    mov esi, [esp + 4]  // src
+    mov edi, [esp + 8]  // dst
+    mov ecx, [esp + 12]  // width
+    rep movsb
+    mov edi, edx
+    mov esi, eax
+    ret
+  }
+}
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
+  __asm {
+    mov eax, [esp + 4]  // src
+    mov edx, [esp + 8]  // dst
+    mov ecx, [esp + 12]  // width
+    pcmpeqb xmm0, xmm0  // generate mask 0xff000000
+    pslld xmm0, 24
+    pcmpeqb xmm1, xmm1  // generate mask 0x00ffffff
+    psrld xmm1, 8
+
+    convertloop:
+    movdqu xmm2, [eax]
+    movdqu xmm3, [eax + 16]
+    lea eax, [eax + 32]
+    movdqu xmm4, [edx]
+    movdqu xmm5, [edx + 16]
+    pand xmm2, xmm0
+    pand xmm3, xmm0
+    pand xmm4, xmm1
+    pand xmm5, xmm1
+    por xmm2, xmm4
+    por xmm3, xmm5
+    movdqu [edx], xmm2
+    movdqu [edx + 16], xmm3
+    lea edx, [edx + 32]
+    sub ecx, 8
+    jg convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
+  __asm {
+    mov eax, [esp + 4]  // src
+    mov edx, [esp + 8]  // dst
+    mov ecx, [esp + 12]  // width
+    vpcmpeqb ymm0, ymm0, ymm0
+    vpsrld ymm0, ymm0, 8  // generate mask 0x00ffffff
+
+    convertloop:
+    vmovdqu ymm1, [eax]
+    vmovdqu ymm2, [eax + 32]
+    lea eax, [eax + 64]
+    vpblendvb ymm1, ymm1, [edx], ymm0
+    vpblendvb ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu [edx], ymm1
+    vmovdqu [edx + 32], ymm2
+    lea edx, [edx + 64]
+    sub ecx, 16
+    jg convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                                                uint8_t* dst_a,
+                                                int width) {
+  __asm {
+    mov eax, [esp + 4]  // src_argb
+    mov edx, [esp + 8]  // dst_a
+    mov ecx, [esp + 12]  // width
+
+    extractloop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    lea eax, [eax + 32]
+    psrld xmm0, 24
+    psrld xmm1, 24
+    packssdw xmm0, xmm1
+    packuswb xmm0, xmm0
+    movq qword ptr [edx], xmm0
+    lea edx, [edx + 8]
+    sub ecx, 8
+    jg extractloop
+
+
ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX + + extractloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpsrld ymm0, ymm0, 24 + vpsrld ymm1, ymm1, 24 + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + lea eax, [eax + 128] + vpackssdw ymm0, ymm0, ymm1 // mutates + vpsrld ymm2, ymm2, 24 + vpsrld ymm3, ymm3, 24 + vpackssdw ymm2, ymm2, ymm3 // mutates + vpackuswb ymm0, ymm0, ymm2 // mutates + vpermd ymm0, ymm4, ymm0 // unmutate + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg extractloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +// Write 'width' bytes using an 8 bit value repeated. +// width should be multiple of 4. +__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { + __asm { + movzx eax, byte ptr [esp + 8] // v8 + mov edx, 0x01010101 // Duplicate byte to all bytes. + mul edx // overwrites edx with upper part of result. + mov edx, edi + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // width + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// Write 'width' bytes using an 8 bit value repeated. +__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // width + rep stosb + mov edi, edx + ret + } +} + +// Write 'width' 32 bit values. 
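+// (rep stosd below stores eax 'width' times at edi; the scalar equivalent:
+//   for (int i = 0; i < width; ++i) ((uint32_t*)dst_argb)[i] = v32;)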
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int width) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // width + rep stosd + mov edi, edx + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. 
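+    // (vpackuswb packs within each 128-bit lane, leaving the 8-byte groups
+    // in 0,2,1,3 order; the vpermq below puts them back in linear order.)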
+ vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, 
xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_BLENDPLANEROW_SSSE3 +// Blend 8 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + __asm { + push esi + push edi + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm6, eax + pshufd xmm6, xmm6, 0x00 + + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + movd xmm7, eax + pshufd xmm7, xmm7, 0x00 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 8 pixel loop. + convertloop8: + movq xmm0, qword ptr [esi] // alpha + punpcklbw xmm0, xmm0 + pxor xmm0, xmm5 // a, 255-a + movq xmm1, qword ptr [eax + esi] // src0 + movq xmm2, qword ptr [edx + esi] // src1 + punpcklbw xmm1, xmm2 + psubb xmm1, xmm6 // bias src0/1 - 128 + pmaddubsw xmm0, xmm1 + paddw xmm0, xmm7 // unbias result - 32768 and round. + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edi + esi], xmm0 + lea esi, [esi + 8] + sub ecx, 8 + jg convertloop8 + + pop edi + pop esi + ret + } +} +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + __asm { + push esi + push edi + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpsllw ymm5, ymm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + vmovd xmm7, eax + vbroadcastss ymm7, xmm7 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 32 pixel loop. 
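+    // (pmaddubsw multiplies an unsigned byte vector by a signed one; biasing
+    // both sources by -128 keeps each a*src0 + (255-a)*src1 pair inside
+    // signed 16 bits, and the 0x807f constant restores the bias and rounds,
+    // per the signed formula in the comment above the function.)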
+ convertloop32: + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm1, [eax + esi] // src0 + vmovdqu ymm2, [edx + esi] // src1 + vpunpckhbw ymm4, ymm1, ymm2 + vpunpcklbw ymm1, ymm1, ymm2 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpmaddubsw ymm3, ymm3, ymm4 + vpmaddubsw ymm0, ymm0, ymm1 + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpsrlw ymm3, ymm3, 8 + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm3 + vmovdqu [edi + esi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg convertloop32 + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_BLENDPLANEROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; + +// Blend 8 pixels at a time. +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 0x0001 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + sub ecx, 4 + jl convertloop4b // less than 4 pixels? + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. 
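+// (Attenuation computes dst = src * a / 255 per channel, approximated below
+// as ((v * 257) * (a * 257)) >> 24: punpcklbw v,v forms v * 257, these
+// tables broadcast the alpha byte into both bytes of each B/G/R word to
+// form a * 257, and pmulhuw plus psrlw supply the shifts. The 0x80 entries
+// zero the alpha word, which is restored from the source by pand/por.)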
+static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, xmmword ptr kShuffleAlpha0 + movdqa xmm5, xmmword ptr kShuffleAlpha1 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpshufb ymm2, ymm0, ymm4 // low 4 alphas + vpshufb ymm3, ymm1, ymm4 // high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * a + vpmulhuw ymm1, ymm1, ymm3 // rgb * a + vpand ymm6, ymm6, ymm5 // isolate alpha + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpor ymm0, ymm0, ymm6 // copy original alpha + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + lea ebx, fixed_invtbl8 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 3] // first alpha + movzx edi, byte ptr [eax + 7] // second alpha + punpcklbw xmm0, xmm0 // first 2 + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm0, xmm2 // rgb * a + + movdqu xmm1, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 11] // third alpha + movzx edi, byte ptr [eax + 15] // forth alpha + punpckhbw xmm1, xmm1 // next 2 + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm1, xmm2 // rgb * a + lea eax, [eax + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. +// USE_GATHER is not on by default, due to being a slow instruction. +#ifdef USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. 
+ vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + vzeroupper + ret + } +} +#else // USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + sub edx, eax + lea ebx, fixed_invtbl8 + vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 + + convertloop: + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + // end of VPGATHER + + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + vzeroupper + ret + } +} +#endif // USE_GATHER +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 + + convertloop: + movdqu xmm0, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // Add .5 for rounding. 
+ psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 G bytes + movdqu xmm2, [eax] // A + movdqu xmm3, [eax + 16] + lea eax, [eax + 32] + psrld xmm2, 24 + psrld xmm3, 24 + packuswb xmm2, xmm3 + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone. +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; + +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; + +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ + movdqa xmm2, xmmword ptr kARGBToSepiaB + movdqa xmm3, xmmword ptr kARGBToSepiaG + movdqa xmm4, xmmword ptr kARGBToSepiaR + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddw xmm0, xmm6 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 B values + movdqu xmm5, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqu xmm5, [eax] // R + movdqu xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqu xmm6, [eax] // A + movdqu xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + movdqu [eax], xmm0 + movdqu [eax + 16], xmm1 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm7, xmm2 + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm6, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] + pmaddubsw xmm1, xmm4 + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 + lea eax, [eax + 32] + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqu xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqu xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + movdqu [eax], xmm0 + lea eax, [eax + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. 
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb + movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// TODO(fbarchard): Port this to posix, neon and other math functions. +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + sub ecx, 4 + jl convertloop49 + + convertloop4: + movdqu xmm0, [eax] // read 4 pixels from src_argb + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + paddusb xmm0, xmm1 // src_argb + src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop49: + add ecx, 4 - 1 + jl convertloop19 + + convertloop1: + movd xmm0, [eax] // read 1 pixels from src_argb + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] + paddusb xmm0, xmm1 // src_argb + src_argb1 + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop19: + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + psubusb xmm0, xmm1 // src_argb - src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + vpxor ymm5, ymm5, ymm5 // constant 0 + + convertloop: + vmovdqu ymm1, [eax] // read 8 pixels from src_argb + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 + vpackuswb ymm0, ymm0, ymm1 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb + lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb + lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA + por xmm2, xmm5 + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA + por xmm0, xmm5 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + movdqa xmm2, xmm0 + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA + punpcklbw xmm3, xmm5 + punpckhbw xmm0, xmm5 + movdqa xmm4, xmm1 // YS + punpcklbw xmm4, xmm2 + punpckhbw xmm1, xmm2 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. +// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time. +// This function requires alignment on accumulation buffer pointers. +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm5, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. 
+ s4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + + // 4 pixel loop + l4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movdqu xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqu xmm2, [esi] // previous row above. 
+ paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqu xmm3, [esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqu xmm4, [esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqu xmm5, [esi + 48] + lea esi, [esi + 64] + paddd xmm5, xmm0 + + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [esi] + lea esi, [esi + 16] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + // Dispatch to specialized filters if applicable. 
+ cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + sub edi, esi + cmp eax, 128 + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + + vmovd xmm0, eax // high fraction 0..255 + neg eax + add eax, 256 + vmovd xmm5, eax // low fraction 256..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vbroadcastss ymm5, xmm5 + + mov eax, 0x80808080 // 128b for bias and rounding. + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 + vpsubb ymm1, ymm1, ymm4 // bias to signed image + vpsubb ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm5, ymm1 + vpmaddubsw ymm0, ymm5, ymm0 + vpaddw ymm1, ymm1, ymm4 // unbias and round + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 8 + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutates + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop + jmp xloop99 + + // Blend 50 / 50. + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop50 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +// Bilinear filter 16x2 -> 16x1 +// TODO(fbarchard): Consider allowing 256 using memcpy. +__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 /256. Blend 100 / 0. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + + movd xmm0, eax // high fraction 0..255 + neg eax + add eax, 256 + movd xmm5, eax // low fraction 255..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm4, eax + pshufd xmm4, xmm4, 0x00 + + xloop: + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + psubb xmm0, xmm4 // bias image by -128 + psubb xmm1, xmm4 + movdqa xmm2, xmm5 + movdqa xmm3, xmm5 + pmaddubsw xmm2, xmm0 + pmaddubsw xmm3, xmm1 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + psrlw xmm2, 8 + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [esi + edi], xmm2 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop + jmp xloop99 + + // Blend 50 / 50. + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop50 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqu xmm5, [ecx] + mov ecx, [esp + 16] // width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + ret + } +} + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpshufb ymm0, ymm0, ymm5 + vpshufb ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... + +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqu [edi], xmm1 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. 
+ convertloop: + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] + punpcklbw xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 + cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 + movdqa xmm1, xmm0 // X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] + movdqa xmm2, xmm1 + movdqa xmm6, xmm5 + mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 + mulps xmm1, xmm2 // X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] + addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 + addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 + cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 2 + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 + mov ecx, [esp + 16] /* width */ + + // 2 pixel loop. + convertloop: + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 2 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_HALFFLOATROW_SSE2 +static float kExpBias = 1.9259299444e-34f; +__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + mulss xmm4, kExpBias + pshufd xmm4, xmm4, 0 + pxor xmm5, xmm5 + sub edx, eax + + // 8 pixel loop. 
+ convertloop: + movdqu xmm2, xmmword ptr [eax] // 8 shorts + add eax, 16 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm5 + cvtdq2ps xmm2, xmm2 // convert 8 ints to floats + punpckhwd xmm3, xmm5 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + psrld xmm2, 13 + psrld xmm3, 13 + packssdw xmm2, xmm3 + movdqu [eax + edx - 16], xmm2 + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + + vmulss xmm4, xmm4, kExpBias + vbroadcastss ymm4, xmm4 + vpxor ymm5, ymm5, ymm5 + sub edx, eax + + // 16 pixel loop. + convertloop: + vmovdqu ymm2, [eax] // 16 shorts + add eax, 32 + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints + vpunpcklwd ymm2, ymm2, ymm5 + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats + vcvtdq2ps ymm2, ymm2 + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. + vmulps ymm2, ymm2, ymm4 + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm2, ymm2, 13 + vpackssdw ymm2, ymm2, ymm3 + vmovdqu [eax + edx - 32], ymm2 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + vbroadcastss ymm4, [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + sub edx, eax + + // 16 pixel loop. + convertloop: + vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints + vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts + add eax, 32 + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm3, ymm3 + vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 + vmulps ymm3, ymm3, ymm4 + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm3, ymm3, 3 + vmovdqu [eax + edx + 32], xmm2 + vmovdqu [eax + edx + 32 + 16], xmm3 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. 
+ convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 + + // 4 pixel loop. + convertloop: + movdqu xmm0, xmmword ptr [eax] // generate luma ptr + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha. + mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. + mov byte ptr [edi + 15], dl + + lea eax, [eax + 16] + lea edi, [edi + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(_M_X64) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) diff --git a/source/scale.cc b/source/scale.cc new file mode 100644 index 00000000..80b030dc --- /dev/null +++ b/source/scale.cc @@ -0,0 +1,2592 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include +#include + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyPlane +#include "libyuv/row.h" +#include "libyuv/scale_row.h" +#include "libyuv/scale_uv.h" // For UVScale + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Scale plane, 1/2 +// This is an optimized version for scaling down a plane to 1/2 of +// its original size. + +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); + int row_stride = src_stride * 2; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON + : ScaleRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_NEON + : ScaleRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_LSX + : (filtering == kFilterLinear ? 
ScaleRowDown2Linear_Any_LSX + : ScaleRowDown2Box_Any_LSX); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_LSX + : ScaleRowDown2Box_LSX); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_16_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C + : ScaleRowDown2Box_16_C); + int row_stride = src_stride * 2; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN2_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_SSE2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 + : ScaleRowDown2Box_16_SSE2); + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +void ScalePlaneDown2_16To8(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width, int scale) = + (src_width & 1) + ? (filtering == kFilterNone + ? ScaleRowDown2_16To8_Odd_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_Odd_C + : ScaleRowDown2Box_16To8_Odd_C)) + : (filtering == kFilterNone + ? ScaleRowDown2_16To8_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C + : ScaleRowDown2Box_16To8_C)); + int row_stride = src_stride * 2; + (void)dst_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < src_height / 2; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width, scale); + src_ptr += row_stride; + dst_ptr += dst_stride; + } + if (src_height & 1) { + if (!filtering) { + src_ptr -= src_stride; // Point to last row. + } + ScaleRowDown2(src_ptr, 0, dst_ptr, dst_width, scale); + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. 
+ +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride * 4; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_LSX : ScaleRowDown4_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_LSX : ScaleRowDown4_LSX; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown4_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; + int row_stride = src_stride * 4; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = + filtering ? 
ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MSA; + ScaleRowDown34_1 = ScaleRowDown34_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_LSX; + ScaleRowDown34_1 = ScaleRowDown34_Any_LSX; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_LSX; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_LSX; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_LSX; + ScaleRowDown34_1 = ScaleRowDown34_LSX; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_LSX; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_LSX; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + 
src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown34_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_C; + ScaleRowDown34_1 = ScaleRowDown34_16_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN34_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_16_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. 
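+// Every 8x8 source block maps to a 3x3 destination block (3/8 in each
+// dimension), so dst_width must be a multiple of 3 (asserted below).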
+// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + (void)src_width; + (void)src_height; + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } + +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } + if (dst_width % 6 == 0 && filtering) { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MSA; + ScaleRowDown38_2 = ScaleRowDown38_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_LSX; + ScaleRowDown38_2 = ScaleRowDown38_Any_LSX; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_LSX; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_LSX; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_LSX; + ScaleRowDown38_2 = ScaleRowDown38_LSX; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_LSX; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_LSX; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + 
ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown38_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_C; + ScaleRowDown38_2 = ScaleRowDown38_16_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN38_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_16_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +#define MIN1(x) ((x) < 1 ? 
1 : (x)) + +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) * + scaletbl[scaletbl_index] >> + 16); + } +} + +static void ScaleAddCols2_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = + SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16; + } +} + +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + (void)dx; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16); + } +} + +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + x >>= 16; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); + x += boxwidth; + } +} + +static void ScaleAddCols1_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint16_t. 
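+    // Each entry accumulates a column sum over boxheight rows of 8-bit
+    // pixels, so 16 bits are sufficient while boxheight <= 257
+    // (255 * 257 = 65535).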
+ align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16_t* src_ptr, uint8_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } +#endif +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif +#if defined(HAS_SCALEADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleAddRow = ScaleAddRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_NEON; + } + } +#endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif +#if defined(HAS_SCALEADDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleAddRow = ScaleAddRow_Any_LSX; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_LSX; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8_t* src = src_ptr + iy * (int64_t)src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16_t*)(row16), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +static void ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint32_t. + align_buffer_64(row32, src_width * 4); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint32_t* src_ptr, uint16_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; + +#if defined(HAS_SCALEADDROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_16_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16_t* src = src_ptr + iy * (int64_t)src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row32, 0, src_width * 4); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint32_t*)(row32), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row32); + } +} + +// Scale plane down with bilinear interpolation. 
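+// Each destination row is produced by blending the two nearest source rows
+// with an 8-bit fraction yf = (y >> 8) & 255. Reference-only sketch of that
+// per-pixel blend (the helper name is illustrative, not part of the libyuv
+// API):
+static __inline uint8_t InterpolatePixel_Ref(uint8_t row0,
+                                             uint8_t row1,
+                                             int yf) {
+  // Weighted average of two vertically adjacent pixels, rounded as in
+  // InterpolateRow_C.
+  return (uint8_t)((row0 * (256 - yf) + row1 * yf + 128) >> 8);
+}
+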
+void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_LSX) + if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_LSX; + } + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_ptr + yi * (int64_t)src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + 
int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width * 2); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_16_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_16_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_16_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_16_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint16_t* src = src_ptr + yi * (int64_t)src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +// Scale up down with bilinear interpolation. +void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = + filtering ? 
ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_LSX) + if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_LSX; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8_t* src = src_ptr + yi * (int64_t)src_stride; + + // Allocate 2 row buffers. + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 2); + + uint8_t* rowptr = row; + int rowstride = row_size; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + if (src_height > 2) { + src += src_stride; + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * (int64_t)src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + if ((y + 65536) < max_y) { + src += src_stride; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale plane, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. 
+// This is used to scale U and V planes of I422 to I444. +void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of I420 to I444. +void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale at most 14 bit plane, horizontally up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I210 to I410 and I212 to I412. 
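+// Per source pair the linear kernel places the two outputs 1/4 and 3/4 of
+// the way between neighboring source samples:
+//   dst[2 * x + 0] = (3 * src[x] + src[x + 1] + 2) >> 2;
+//   dst[2 * x + 1] = (src[x] + 3 * src[x + 1] + 2) >> 2;
+// i.e. 3:1 weights with rounding, applied here to uint16_t samples.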
+void ScalePlaneUp2_12_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale at most 12 bit plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I010 to I410 and I012 to I412. +void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. 
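+  // (dst_width + 1) / 2 permits an odd destination width; the _Any_ row
+  // helpers handle the final odd column separately.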
+ assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +void ScalePlaneUp2_16_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = + filtering ? 
ScaleFilterCols_16_C : ScaleCols_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_16_Any_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_16_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_16_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_16_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_16_C; + } +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_16_SSE2; + } +#endif + } + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint16_t* src = src_ptr + yi * (int64_t)src_stride; + + // Allocate 2 row buffers. + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 4); + + uint16_t* rowptr = (uint16_t*)row; + int rowstride = row_size; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + if (src_height > 2) { + src += src_stride; + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * (int64_t)src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + if ((y + 65536) < max_y) { + src += src_stride; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + int i; + void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. 
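+  // Example: src_width = 640, dst_width = 256 gives
+  // dx = (640 << 16) / 256 = 163840, i.e. 2.5 in 16.16, so destination
+  // column i reads source column (x + i * dx) >> 16.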
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); + dst_ptr += dst_stride; + y += dy; + } +} + +static void ScalePlaneSimple_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + int i; + void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_16_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. +LIBYUV_API +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t)src_stride; + src_stride = -src_stride; + } + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. 
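+    // e.g. 1280x720 -> 960x540 is 3/4, 640x360 is 1/2, 480x270 is 3/8 and
+    // 320x180 is 1/4.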
+ if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +LIBYUV_API +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t)src_stride; + src_stride = -src_stride; + } + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. + // When scaling up, ensure the last row of destination uses the last + // source. Avoid divide by zero for dst_height but will do no scaling + // later. 
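+      // With dy = FixedDiv1(src_height, dst_height), y ends at or just
+      // below (src_height - 1) << 16 after dst_height - 1 steps, so the
+      // final destination row blends the last two source rows.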
+ } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +LIBYUV_API +void ScalePlane_12(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. 
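+  // (Pointing src at the last row and negating the stride walks the image
+  // bottom-up without copying.)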
+ if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t)src_stride; + src_stride = -src_stride; + } + + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + + ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, + dst_width, dst_height, filtering); +} + +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + +LIBYUV_API +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + +LIBYUV_API +int I420Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum 
FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + +// Scale an I444 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I444Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I444Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I444Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + 
ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + +// Scale an I422 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I422Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I422Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I422Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + 
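+// Example use of the planar scalers above (illustrative only; dimensions and
+// strides are hypothetical, with strides equal to the plane widths):
+//   // Shrink a 1920x1080 I420 frame to 1280x720 with box filtering.
+//   I420Scale(src_y, 1920, src_u, 960, src_v, 960, 1920, 1080,
+//             dst_y, 1280, dst_u, 640, dst_v, 640, 1280, 720, kFilterBox);
+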
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_uv,
+              int dst_stride_uv,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+
+  if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+          dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+  return 0;
+}
+
+// Deprecated API.
+LIBYUV_API
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
+          LIBYUV_BOOL interpolate) {
+  return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                   src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+                   dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+                   dst_height, interpolate ? kFilterBox : kFilterNone);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/scale_any.cc b/source/scale_any.cc
new file mode 100644
index 00000000..f6576874
--- /dev/null
+++ b/source/scale_any.cc
@@ -0,0 +1,1078 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>  // For memset/memcpy
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fixed scale down.
+// Mask may be non-power of 2, so use MOD
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+               int dst_width) {                                                \
+    int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */          \
+    int n = dst_width - r;                                                     \
+    if (n > 0) {                                                               \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
+    }                                                                          \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
+                   dst_ptr + n * BPP, r);                                      \
+  }
+
+// Fixed scale down for odd source width. Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
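+// Worked example (illustrative): with MASK = 15 and dst_width = 100, SDANY
+// runs the SIMD path on n = 96 pixels and the C path on the remaining r = 4.
+// SDODD (below) does the same arithmetic over dst_width - 1 = 99 (n = 96,
+// r = 3) and hands r + 1 = 4 pixels to the C path so it also produces the
+// final copied pixel.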
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSSE3 +SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +SDANY(ScaleUVRowDown2Box_Any_SSSE3, + ScaleUVRowDown2Box_SSSE3, + ScaleUVRowDown2Box_C, + 2, + 2, + 3) +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +SDANY(ScaleUVRowDown2Box_Any_AVX2, + ScaleUVRowDown2Box_AVX2, + ScaleUVRowDown2Box_C, + 2, + 2, + 7) +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) +#endif +#ifdef HAS_SCALEROWDOWN2_NEON +SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, + ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEUVROWDOWN2_NEON +SDANY(ScaleUVRowDown2_Any_NEON, + ScaleUVRowDown2_NEON, + ScaleUVRowDown2_C, + 2, + 2, + 7) +#endif +#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON +SDANY(ScaleUVRowDown2Linear_Any_NEON, + ScaleUVRowDown2Linear_NEON, + ScaleUVRowDown2Linear_C, + 2, + 2, + 7) +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_NEON +SDANY(ScaleUVRowDown2Box_Any_NEON, + ScaleUVRowDown2Box_NEON, + ScaleUVRowDown2Box_C, + 2, + 2, + 7) +#endif + +#ifdef HAS_SCALEROWDOWN2_MSA +SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_MSA, + ScaleRowDown2Linear_MSA, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_MSA, + ScaleRowDown2Box_MSA, + ScaleRowDown2Box_C, + 2, + 1, + 31) +#endif +#ifdef HAS_SCALEROWDOWN2_LSX +SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_LSX, + ScaleRowDown2Linear_LSX, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_LSX, + ScaleRowDown2Box_LSX, + ScaleRowDown2Box_C, + 2, + 1, + 31) +#endif +#ifdef HAS_SCALEROWDOWN4_SSSE3 +SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) +#endif +#ifdef 
HAS_SCALEROWDOWN4_NEON +SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, + ScaleRowDown4Box_NEON, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_MSA +SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_MSA, + ScaleRowDown4Box_MSA, + ScaleRowDown4Box_C, + 4, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN4_LSX +SDANY(ScaleRowDown4_Any_LSX, ScaleRowDown4_LSX, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_LSX, + ScaleRowDown4Box_LSX, + ScaleRowDown4Box_C, + 4, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_NEON +SDANY(ScaleRowDown34_Any_NEON, + ScaleRowDown34_NEON, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, + ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, + ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_MSA +SDANY(ScaleRowDown34_Any_MSA, + ScaleRowDown34_MSA, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_MSA, + ScaleRowDown34_0_Box_MSA, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_MSA, + ScaleRowDown34_1_Box_MSA, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) +#endif +#ifdef HAS_SCALEROWDOWN34_LSX +SDANY(ScaleRowDown34_Any_LSX, + ScaleRowDown34_LSX, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_LSX, + ScaleRowDown34_0_Box_LSX, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_LSX, + ScaleRowDown34_1_Box_LSX, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) +#endif +#ifdef HAS_SCALEROWDOWN38_SSSE3 +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) +#endif +#ifdef HAS_SCALEROWDOWN38_NEON +SDANY(ScaleRowDown38_Any_NEON, + ScaleRowDown38_NEON, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, + ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, + ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_MSA +SDANY(ScaleRowDown38_Any_MSA, + ScaleRowDown38_MSA, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_MSA, + ScaleRowDown38_3_Box_MSA, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_MSA, + ScaleRowDown38_2_Box_MSA, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_LSX +SDANY(ScaleRowDown38_Any_LSX, + ScaleRowDown38_LSX, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_LSX, + ScaleRowDown38_3_Box_LSX, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_LSX, + ScaleRowDown38_2_Box_LSX, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_SSE2 
+SDANY(ScaleARGBRowDown2_Any_SSE2, + ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, + ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, + ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_NEON +SDANY(ScaleARGBRowDown2_Any_NEON, + ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, + ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, + ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, + 2, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_LSX +SDANY(ScaleARGBRowDown2_Any_LSX, + ScaleARGBRowDown2_LSX, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_LSX, + ScaleARGBRowDown2Linear_LSX, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_LSX, + ScaleARGBRowDown2Box_LSX, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif +#undef SDANY + +// Scale down by even scale factor. +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +SDAANY(ScaleARGBRowDownEven_Any_SSE2, + ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, + ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +SDAANY(ScaleARGBRowDownEven_Any_NEON, + ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, + ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_LSX +SDAANY(ScaleARGBRowDownEven_Any_LSX, + ScaleARGBRowDownEven_LSX, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_LSX, + ScaleARGBRowDownEvenBox_LSX, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEUVROWDOWNEVEN_NEON +SDAANY(ScaleUVRowDownEven_Any_NEON, + ScaleUVRowDownEven_NEON, + ScaleUVRowDownEven_C, + 2, + 3) +#endif + +#ifdef SASIMDONLY +// This also works and uses memcpy and SIMD instead of C, but is slower on ARM + +// Add rows box filter scale down. 
Using macro from row_any
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                       \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) {  \
+    SIMD_ALIGNED(uint16_t dst_temp[32]);                                \
+    SIMD_ALIGNED(uint8_t src_temp[32]);                                 \
+    memset(dst_temp, 0, 32 * 2); /* for msan */                         \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                    \
+    }                                                                   \
+    memcpy(src_temp, src_ptr + n * SBPP, r * SBPP);                     \
+    memcpy(dst_temp, dst_ptr + n * BPP, r * BPP);                       \
+    ANY_SIMD(src_temp, dst_temp, MASK + 1);                             \
+    memcpy(dst_ptr + n * BPP, dst_temp, r * BPP);                       \
+  }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_LSX
+SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15)
+#endif
+#undef SAROW
+
+#else
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)              \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+    int n = src_width & ~MASK;                                             \
+    if (n > 0) {                                                           \
+      SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                               \
+    }                                                                      \
+    SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);             \
+  }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_LSX
+SAANY(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#endif  // SASIMDONLY
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                           \
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+               int dx) {                                                      \
+    int r = dst_width & MASK;                                                 \
+    int n = dst_width & ~MASK;                                                \
+    if (n > 0) {                                                              \
+      TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                  \
+    }                                                                         \
+    TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx);                    \
+  }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_LSX
+CANY(ScaleFilterCols_Any_LSX, ScaleFilterCols_LSX, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_LSX
+CANY(ScaleARGBCols_Any_LSX, ScaleARGBCols_LSX, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+     ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C,
+     4,
+     3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+     ScaleARGBFilterCols_MSA,
+     ScaleARGBFilterCols_C,
+     4,
+     7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_LSX
+CANY(ScaleARGBFilterCols_Any_LSX,
+     ScaleARGBFilterCols_LSX,
+     ScaleARGBFilterCols_C,
+     4,
+     7)
+#endif
+#undef CANY
+
+// Scale up horizontally 2 times
using linear filter. +#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 1, n); \ + } \ + C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ + } \ + dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \ + } + +// Even the C versions need to be wrapped, because boundary pixels have to +// be handled differently + +SUH2LANY(ScaleRowUp2_Linear_Any_C, + ScaleRowUp2_Linear_C, + ScaleRowUp2_Linear_C, + 0, + uint8_t) + +SUH2LANY(ScaleRowUp2_Linear_16_Any_C, + ScaleRowUp2_Linear_16_C, + ScaleRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, + ScaleRowUp2_Linear_SSE2, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, + ScaleRowUp2_Linear_SSSE3, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, + ScaleRowUp2_Linear_12_SSSE3, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 +SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, + ScaleRowUp2_Linear_AVX2, + ScaleRowUp2_Linear_C, + 31, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 +SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, + ScaleRowUp2_Linear_12_AVX2, + ScaleRowUp2_Linear_16_C, + 31, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, + ScaleRowUp2_Linear_16_AVX2, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_NEON +SUH2LANY(ScaleRowUp2_Linear_Any_NEON, + ScaleRowUp2_Linear_NEON, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON +SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, + ScaleRowUp2_Linear_12_NEON, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON +SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, + ScaleRowUp2_Linear_16_NEON, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#undef SUH2LANY + +// Scale up 2 times using bilinear filter. +// This function produces 2 rows at a time. 
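+// The two output rows blend the two source rows with 3:1 and 1:3 weights
+// and round to nearest: da[i] = (3 * sa + sb + 2) >> 2 and
+// db[i] = (sa + 3 * sb + 2) >> 2, as in the boundary handling below.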
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ + db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 1, db - da, n); \ + } \ + C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ + } \ + da[dst_width - 1] = \ + (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ + db[dst_width - 1] = \ + (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ + } + +SU2BLANY(ScaleRowUp2_Bilinear_Any_C, + ScaleRowUp2_Bilinear_C, + ScaleRowUp2_Bilinear_C, + 0, + uint8_t) + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, + ScaleRowUp2_Bilinear_16_C, + ScaleRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, + ScaleRowUp2_Bilinear_SSE2, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, + ScaleRowUp2_Bilinear_12_SSSE3, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, + ScaleRowUp2_Bilinear_SSSE3, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, + ScaleRowUp2_Bilinear_AVX2, + ScaleRowUp2_Bilinear_C, + 31, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, + ScaleRowUp2_Bilinear_12_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, + ScaleRowUp2_Bilinear_16_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_NEON +SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, + ScaleRowUp2_Bilinear_NEON, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, + ScaleRowUp2_Bilinear_12_NEON, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, + ScaleRowUp2_Bilinear_16_NEON, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#undef SU2BLANY + +// Scale bi-planar plane up horizontally 2 times using linear filter. 
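+// "Bi-planar" here means interleaved UV pairs (e.g. the NV12/NV21 chroma
+// plane), so the row steps by 2 elements per pixel: the first and last
+// U/V pair are copied and the interior pairs are interpolated.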
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + dst_ptr[1] = src_ptr[1]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 2, n); \ + } \ + C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ + } \ + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ + } + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, + ScaleUVRowUp2_Linear_C, + ScaleUVRowUp2_Linear_C, + 0, + uint8_t) + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, + ScaleUVRowUp2_Linear_16_C, + ScaleUVRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, + ScaleUVRowUp2_Linear_SSSE3, + ScaleUVRowUp2_Linear_C, + 7, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 +SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, + ScaleUVRowUp2_Linear_AVX2, + ScaleUVRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, + ScaleUVRowUp2_Linear_16_SSE41, + ScaleUVRowUp2_Linear_16_C, + 3, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, + ScaleUVRowUp2_Linear_16_AVX2, + ScaleUVRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON +SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, + ScaleUVRowUp2_Linear_NEON, + ScaleUVRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, + ScaleUVRowUp2_Linear_16_NEON, + ScaleUVRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#undef SBUH2LANY + +// Scale bi-planar plane up 2 times using bilinear filter. +// This function produces 2 rows at a time. 
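+// As with SU2BLANY above, the two rows use 3:1 and 1:3 blends with
+// rounding; the corner U/V pairs are computed directly and the interior is
+// delegated to the SIMD or C row function.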
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE)                             \
+  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr,   \
+            ptrdiff_t dst_stride, int dst_width) {                        \
+    int work_width = (dst_width - 1) & ~1;                                \
+    int r = work_width & MASK;                                            \
+    int n = work_width & ~MASK;                                           \
+    const PTYPE* sa = src_ptr;                                            \
+    const PTYPE* sb = src_ptr + src_stride;                               \
+    PTYPE* da = dst_ptr;                                                  \
+    PTYPE* db = dst_ptr + dst_stride;                                     \
+    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                 \
+    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                 \
+    da[1] = (3 * sa[1] + sb[1] + 2) >> 2;                                 \
+    db[1] = (sa[1] + 3 * sb[1] + 2) >> 2;                                 \
+    if (work_width > 0) {                                                 \
+      if (n != 0) {                                                       \
+        SIMD(sa, sb - sa, da + 2, db - da, n);                            \
+      }                                                                   \
+      C(sa + n, sb - sa, da + 2 * n + 2, db - da, r);                     \
+    }                                                                     \
+    da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] +         \
+                             sb[((dst_width + 1) & ~1) - 2] + 2) >>       \
+                            2;                                            \
+    db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] +             \
+                             3 * sb[((dst_width + 1) & ~1) - 2] + 2) >>   \
+                            2;                                            \
+    da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] +         \
+                             sb[((dst_width + 1) & ~1) - 1] + 2) >>       \
+                            2;                                            \
+    db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] +             \
+                             3 * sb[((dst_width + 1) & ~1) - 1] + 2) >>   \
+                            2;                                            \
+  }
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
+          ScaleUVRowUp2_Bilinear_C,
+          ScaleUVRowUp2_Bilinear_C,
+          0,
+          uint8_t)
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
+          ScaleUVRowUp2_Bilinear_16_C,
+          ScaleUVRowUp2_Bilinear_16_C,
+          0,
+          uint16_t)
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
+          ScaleUVRowUp2_Bilinear_SSSE3,
+          ScaleUVRowUp2_Bilinear_C,
+          7,
+          uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
+          ScaleUVRowUp2_Bilinear_AVX2,
+          ScaleUVRowUp2_Bilinear_C,
+          15,
+          uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41,
+          ScaleUVRowUp2_Bilinear_16_SSE41,
+          ScaleUVRowUp2_Bilinear_16_C,
+          7,
+          uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
+          ScaleUVRowUp2_Bilinear_16_AVX2,
+          ScaleUVRowUp2_Bilinear_16_C,
+          7,
+          uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
+          ScaleUVRowUp2_Bilinear_NEON,
+          ScaleUVRowUp2_Bilinear_C,
+          7,
+          uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
+          ScaleUVRowUp2_Bilinear_16_NEON,
+          ScaleUVRowUp2_Bilinear_16_C,
+          7,
+          uint16_t)
+#endif
+
+#undef SBU2BLANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
new file mode 100644
index 00000000..ddd8d29e
--- /dev/null
+++ b/source/scale_argb.cc
@@ -0,0 +1,1157 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ?
v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width,
+                           int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int x,
+                           int dx,
+                           int y,
+                           int dy,
+                           enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleARGBRowDown2_C
+          : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+                                        : ScaleARGBRowDown2Box_C);
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 2);      // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
+  } else {
+    src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4;
+  }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_SSE2
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+                                          : ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_SSE2
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+                                            : ScaleARGBRowDown2Box_SSE2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+                                          : ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_NEON
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+                                            : ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+                                          : ScaleARGBRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_MSA
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+                                            : ScaleARGBRowDown2Box_MSA);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_LSX
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_LSX
+                                          : ScaleARGBRowDown2Box_Any_LSX);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_LSX
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_LSX
+                                            : ScaleARGBRowDown2Box_LSX);
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original size.
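+// The 4x4 box is decomposed into three 2x2 box passes: two passes reduce
+// four source rows to two intermediate rows of width dst_width * 2, and a
+// third pass reduces those two rows to the final output row.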
+static void ScaleARGBDown4Box(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy) {
+  int j;
+  // Allocate 2 rows of ARGB.
+  const int row_size = (dst_width * 2 * 4 + 31) & ~31;
+  align_buffer_64(row, row_size * 2);
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      ScaleARGBRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 4);      // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + row_size,
+                      dst_width * 2);
+    ScaleARGBRowDown2(row, row_size, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB image to an even
+// multiple of its original size.
+static void ScaleARGBDownEven(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy,
+                              enum FilterMode filtering) {
+  int j;
+  int col_step = dx >> 16;
+  ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
+  void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8_t* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  (void)src_width;
+  (void)src_height;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+                                     : ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+                                     : ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+                                     : ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ?
ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_LSX + : ScaleARGBRowDownEven_Any_LSX; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_LSX : ScaleARGBRowDownEven_LSX; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// Scale ARGB down with bilinear interpolation. +static void ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. + src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = 
ScaleARGBFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_LSX; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = + filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_LSX) + if (filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_LSX; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_LSX) + if (!filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_LSX; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_LSX; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; + + // Allocate 2 rows of ARGB. + const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + + uint8_t* rowptr = row; + int rowstride = row_size; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + if (src_height > 2) { + src += src_stride; + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * (intptr_t)src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + if ((y + 65536) < max_y) { + src += src_stride; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
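+// Note: this path is only compiled when YUVSCALEUP is defined. Each new
+// source row is converted to ARGB with I422ToARGBRow and then scaled
+// horizontally, so the two ping-pong row buffers hold ARGB, not YUV.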
+static void ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int width) = + I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(src_width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(src_width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif + + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_LSX) + if (filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_LSX; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_LSX) + if (!filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_LSX; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_LSX; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v; + + // Allocate 2 rows of ARGB. + const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_row, src_width * 4); + + uint8_t* rowptr = row; + int rowstride = row_size; + int lasty = yi; + + // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. 
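+  // Per the TODO above, the first two rows are scaled straight from the Y
+  // plane rather than from converted ARGB. The (yi & 1) tests advance the
+  // UV rows on every other Y row, matching 4:2:0 vertical subsampling.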
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * (intptr_t)src_stride_y;
+        src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+        src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_row);
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScaleARGBSimple(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy) {
+  int j;
+  void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+                        int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+  (void)src_height;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBCols = ScaleARGBCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBCols_LSX;
+    }
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride,
+                  dst_width, x, dx);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Scale an ARGB image.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
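+// Dispatch, roughly: exact even integer steps take the Down2 / Down4Box /
+// DownEven fast paths (or a straight ARGBCopy at 1:1); a 1:1 horizontal
+// step with no phase uses ScalePlaneVertical; otherwise bilinear up or
+// down when filtered, else point sampling via ScaleARGBSimple.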
+static void ScaleARGB(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (intptr_t)src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64_t clipf = (int64_t)(clip_x)*dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 4; + dst += clip_x * 4; + } + if (clip_y) { + int64_t clipf = (int64_t)(clip_y)*dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * (intptr_t)src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4, + src_stride, dst, dst_stride, clip_width, clip_height); + return; + } + } + } + } + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled horizontally. 
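+    // dx == 0x10000 is a horizontal step of exactly 1.0 with a zero x
+    // fraction, so each output pixel maps 1:1 onto a source pixel and only
+    // the rows need interpolating.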
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, /*bpp=*/4, filtering); + return; + } + if (filtering && dy < 65536) { + ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + if (filtering) { + ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); +} + +LIBYUV_API +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || + dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || + clip_width > 32768 || clip_height > 32768 || + (clip_x + clip_width) > dst_width || + (clip_y + clip_height) > dst_height) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, + clip_height, filtering); + return 0; +} + +// Scale an ARGB image. +LIBYUV_API +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, + filtering); + return 0; +} + +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); + int r; + (void)src_fourcc; // TODO(fbarchard): implement and/or assert. + (void)dst_fourcc; + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); + + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); + free(argb_buffer); + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_common.cc b/source/scale_common.cc new file mode 100644 index 00000000..77455903 --- /dev/null +++ b/source/scale_common.cc @@ -0,0 +1,1999 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+#define STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#define STATIC_CAST(type, expr) (type)(expr)
+#endif
+
+// TODO(fbarchard): make clamp255 preserve negative values.
+static __inline int32_t clamp255(int32_t v) {
+  return (-(v >= 255) | v) & 255;
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width) {
+  int x;
+  (void)src_stride;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width) {
+  int x;
+  (void)src_stride;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2_16To8_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width,
+                           int scale) {
+  int x;
+  (void)src_stride;
+  assert(scale >= 256);
+  assert(scale <= 32768);
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+    dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale));
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+  }
+}
+
+void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width,
+                               int scale) {
+  int x;
+  (void)src_stride;
+  assert(scale >= 256);
+  assert(scale <= 32768);
+  dst_width -= 1;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+    dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale));
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+    dst += 1;
+    src_ptr += 2;
+  }
+  dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[0], scale));
+}
+
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  const uint8_t* s = src_ptr;
+  int x;
+  (void)src_stride;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+    dst[1] = (s[2] + s[3] + 1) >> 1;
+    dst += 2;
+    s += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width) {
+  const uint16_t* s = src_ptr;
+  int x;
+  (void)src_stride;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+    dst[1] = (s[2] + s[3] + 1) >> 1;
+    dst += 2;
+    s += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr,
+                                 ptrdiff_t
src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + } +} + +void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst += 1; + s += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8(s[0], scale)); +} + +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst += 1; + s += 2; + t += 2; + } + dst[0] = (s[0] + t[0] + 1) >> 1; +} + +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + } +} + +void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + 
dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst += 1; + s += 2; + t += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + t[0] + 1) >> 1, scale)); +} + +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + 
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +void ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 
+ 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Sample position: (O is src sample position, X is dst sample position) +// +// v dst_ptr at here v stop at here +// X O X X O X X O X X O X X O X +// ^ src_ptr at here +void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Sample position: (O is src sample position, X is dst sample position) +// +// src_ptr at here +// X v X X X X X X X X X +// O O O O O +// X X X X X X X X X X +// ^ dst_ptr at here ^ stop at here +// X X X X X X X X X X +// O O O O O +// X X X X X X X X X X +void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// Only suitable for at most 14 bit range. +void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Only suitable for at most 12bit range. +void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + uint16_t* d = dst_ptr; + uint16_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// Scales a single row of pixels using point sampling. 
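+// For example, with x = 0 and dx = 0x20000 (a step of 2.0), ScaleCols_C below
+// emits src_ptr[0], src_ptr[2], src_ptr[4], and so on: a straight 1/2
+// decimation with no filtering.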
+void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#if defined(__arm__) || defined(__aarch64__) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#else +// Intel uses 7 bit math with rounding. +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +#endif + +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)( \ + (int)(a) + \ + (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = 
src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536u / 9u) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536u / 9u) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536u / 6u) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + 
src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536u / 6u) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536u / 6u) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536u / 4u) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +// ARGB scale row functions + +void ScaleARGBRowDown2_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + (void)src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src 
+= src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. 
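+// For f in [0, 127], (0x7f ^ f) equals 127 - f (XOR with 0x7f complements the
+// low 7 bits), so BLENDER1 below computes (a * (127 - f) + b * f) >> 7.
+// Example: a = 0, b = 255, f = 64 gives (0 * 63 + 255 * 64) >> 7 = 127,
+// i.e. approximately the midpoint.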
+// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ + BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// UV scale row functions +// same as ARGB but 2 channels + +void ScaleUVRowDown2_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = src_uv[2]; // Store the 2nd UV + dst_uv[1] = src_uv[3]; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; + dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Box_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDownEven_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + (void)src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + 
dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += src_stepx * 2; + dst_uv += 2; + } +} + +void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + +void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + uint16_t* d = dst_ptr; + uint16_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] 
* 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleUVCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleUVCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleUVColsUp2_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleUVFilterCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleUVFilterCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. 
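+//
+// Worked example (ignoring the centering offset that ScaleSlope applies):
+// scaling 480 source rows to 360 output rows gives dy = (480 << 16) / 360 =
+// 0x15555 (~1.333 in 16.16). For the second output row, y = 0x15555, so
+// yi = y >> 16 = 1 and yf = (y >> 8) & 255 = 85: that row blends source rows
+// 1 and 2, weighted 85/256 toward row 2.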
+void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, // bytes per pixel. 4 for ARGB. + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} + +void ScalePlaneVertical_16(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, /* words per pixel. normally 1 */ + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + int j; + assert(wpp >= 1 && wpp <= 2); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * wpp; +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_16_Any_SSE2; + if (IS_ALIGNED(dst_width_words, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_16_Any_SSSE3; + if (IS_ALIGNED(dst_width_words, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_16_Any_AVX2; + if (IS_ALIGNED(dst_width_words, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_16_Any_NEON; + if (IS_ALIGNED(dst_width_words, 8)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_words, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +void ScalePlaneVertical_16To8(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int wpp, /* words per pixel. normally 1 */ + int scale, + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions. + void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb, + ptrdiff_t src_stride, int scale, int dst_width, + int source_y_fraction) = InterpolateRow_16To8_C; + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + int j; + assert(wpp >= 1 && wpp <= 2); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * wpp; + +#if defined(HAS_INTERPOLATEROW_16TO8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow_16To8 = InterpolateRow_16To8_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16TO8_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow_16To8 = InterpolateRow_16To8_AVX2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride, + scale, dst_width_words, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. 
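+//
+// For example: Box falls back to Bilinear once either axis is scaled to half
+// size or larger; Bilinear falls back to Linear when the height is unchanged
+// or scaled to exactly 1/3; Linear falls back to None when the width is
+// unchanged or scaled to exactly 1/3; and 1-pixel-wide sources always use
+// None to avoid reading a second pixel.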
+enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. + if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div) { + return (int)(((int64_t)(num) << 16) / div); +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div) { + return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int* x, + int* y, + int* dx, + int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_width > 1 && dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_height > 1 && dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. 
+ } else if (src_width > 1 && dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} +#undef CENTERSTART + +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2 = src_ptr + src_stride; + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; + ++src_ptr; + ++src2; + dst += 2; + } + if (dst_width & 1) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc new file mode 100644 index 00000000..17eeffad --- /dev/null +++ b/source/scale_gcc.cc @@ -0,0 +1,2953 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Coefficients for source bytes 21 to 31 +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; + +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + // 16 pixel loop. 
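+ // Each iteration loads 32 source bytes, shifts each 16-bit lane right by 8
+ // to keep the odd-indexed bytes (every second pixel), then packs the two
+ // registers into 16 destination bytes.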
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile(LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", 
"xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stridex3; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEROWDOWN4_AVX2 + +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa 
%1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : 
"m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, + 10, 11, 8, 9, 14, 15, 12, 13}; + +static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, + 3, 1, 1, 3, 3, 1, 1, 3}; + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" // 0 + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $1,%%xmm6 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw 
%%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm6,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" + "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) + "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm6,%%xmm1 \n" + "paddw %%xmm3,%%xmm3 \n" + "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + LABELALIGN + "1: \n" + "pxor %%xmm0,%%xmm0 \n" // 0 + // above line + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" // near+far + "movdqa %%xmm3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" // 2*near + "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + // below line + "movq (%0,%3),%%xmm6 \n" // 01234567 + "movq 1(%0,%3),%%xmm2 \n" // 12345678 + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm6,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) + "paddw %%xmm7,%%xmm5 \n" // near+far + "movdqa %%xmm3,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) + "paddw %%xmm7,%%xmm7 \n" // 2*near + "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) + + "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm6,%%xmm2 \n" // near+far + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) + + // xmm4 xmm1 + // xmm5 xmm2 + "pcmpeqw %%xmm0,%%xmm0 \n" + "psrlw $15,%%xmm0 \n" + "psllw $3,%%xmm0 \n" // all 8 + + "movdqa %%xmm4,%%xmm3 \n" + "movdqa %%xmm5,%%xmm6 \n" + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) + "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + + "movdqa %%xmm1,%%xmm7 \n" + "movdqa %%xmm2,%%xmm6 \n" + "paddw %%xmm7,%%xmm7 
\n" // 6*near+2*far (1, hi) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) + "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm7 \n" // ^ div by 16 + + "packuswb %%xmm7,%%xmm3 \n" + "movdqu %%xmm3,(%1) \n" // save above line + + "movdqa %%xmm5,%%xmm3 \n" + "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) + "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 + + "movdqa %%xmm2,%%xmm3 \n" + "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) + "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) + "psrlw $4,%%xmm2 \n" // ^ div by 16 + + "packuswb %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // save below line + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %3,%%xmm5 \n" + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) + + "paddw %%xmm4,%%xmm1 \n" // far+2 + "paddw %%xmm4,%%xmm3 \n" // far+2 + "paddw %%xmm0,%%xmm1 \n" // near+far+2 + "paddw %%xmm2,%%xmm3 \n" // near+far+2 + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) + + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,16(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" + "psllw $3,%%xmm7 \n" // all 8 + "movdqa %5,%%xmm6 \n" + + LABELALIGN + "1: \n" + // above line + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) + "paddw %%xmm0,%%xmm1 \n" // near+far + "paddw %%xmm2,%%xmm3 \n" // near+far + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw 
%%xmm1,%%xmm0 \n" // 3*near+far (1, lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) + + // below line + "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) + "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) + "movdqa %%xmm1,%%xmm3 \n" + "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) + "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) + "movdqa %%xmm3,%%xmm5 \n" + "movdqa %%xmm1,%%xmm4 \n" + "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) + "paddw %%xmm1,%%xmm4 \n" // near+far + "paddw %%xmm3,%%xmm5 \n" // near+far + "paddw %%xmm1,%%xmm1 \n" // 2*near + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,(%1) \n" + + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,0x10(%1) \n" + + "movdqa %%xmm1,%%xmm4 \n" + "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) + "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm1 \n" // ^ div by 16 + "movdqu %%xmm1,(%1,%4,2) \n" + + "movdqa %%xmm3,%%xmm4 \n" + "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + "movdqu %%xmm3,0x10(%1,%4,2) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packssdw %%xmm1,%%xmm0 \n" + "pshufd 
$0b11011000,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + "paddd %%xmm0,%%xmm2 \n" // near+far (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 2(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) + "paddd %%xmm2,%%xmm4 \n" // near+far (lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, 
hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packssdw %%xmm0,%%xmm4 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packssdw %%xmm2,%%xmm5 \n" + "pshufd $0b11011000,%%xmm5,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 1(%0,%3),%%xmm4 \n" + "punpcklwd %%xmm1,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm4 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm4,%%xmm3 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 
3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 +void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 +void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklwd 
%%ymm2,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vbroadcastf128 %3,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) + "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) + + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 + + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 + "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 + + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far + "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,32(%1) \n" + + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + 
"+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) + + "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) + + "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) + "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1) \n" // store above + + "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) + "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) + "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 +void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + 
"vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" 
// store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +// Reads 16xN bytes and produces 16 shorts at a time. +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile("pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + intptr_t x0, x1, temp_pixel; + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. 
+ "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 +#if defined(__x86_64__) + "+rm"(dst_width) // %5 +#else + "+m"(dst_width) // %5 +#endif + : "rm"(x), // %6 + "rm"(dx), // %7 +#if defined(__x86_64__) + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 +#else + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile(LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +// Reads 4 
pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + (void)src_stride; + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + intptr_t row1 = (intptr_t)(src_stride); + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + intptr_t x0, x1; + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + 
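        // x0 and x1 are pinned to eax/edx ("=&a"/"=&d"); they hold the
+        // whole-pixel offsets pulled from xmm2 with pextrw and index
+        // src_argb at 4 bytes per pixel. +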
"=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile(LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + intptr_t x0, x1; + asm volatile( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN + "99: \n" // clang-format error. + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
+int FixedDiv1_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ + defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + +// Shuffle table for splitting UV into upper and lower part of register. +static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; +static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, + 6u, 14u, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}; +#endif + +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 + +void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5, %%xmm5 \n" // zero + "movdqa %4,%%xmm1 \n" // split shuffler + "movdqa %5,%%xmm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 8 UV row 0 + "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 + "lea 0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv + "pshufb %%xmm1,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add + "pmaddubsw %%xmm4,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" // vertical add + "psrlw $0x1,%%xmm0 \n" // round + "pavgw %%xmm5,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" // merge uv + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" // 4 UV + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 + +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero + "vbroadcastf128 %4,%%ymm1 \n" // split shuffler + "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 + "lea 0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv + "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv + "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" // 8 UV + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 + +static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, + 3, 1, 3, 1, 1, 3, 1, 3}; + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + 
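      // pcmpeqw/psrlw/psllw build the constant 2 in every 16-bit lane; the
+      // loop computes (3*near + far + 2) >> 2, i.e. a 3/4,1/4 upsample. +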
"psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 2(%0,%3),%%xmm4 \n" + "punpcklbw %%xmm4,%%xmm1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm1,%%xmm3 \n" + "punpckldq %%xmm1,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + 
"+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 + +void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" + "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw 
$4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) + "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packusdw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 4(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b01001110,%%xmm4,%%xmm4 \n" 
// 1100 (far) (2, lo) + "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) + "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packusdw %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packusdw %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef 
HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_lsx.cc b/source/scale_lsx.cc new file mode 100644 index 
00000000..bfe5e9fb --- /dev/null +++ b/source/scale_lsx.cc @@ -0,0 +1,739 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Copyright (c) 2022 Loongson Technology Corporation Limited + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/scale_row.h" + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#include "libyuv/loongson_intrinsics.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_DATA(_src, _in, _out) \ + { \ + int _tmp1, _tmp2, _tmp3, _tmp4; \ + DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \ + _tmp2, _tmp3, _tmp4); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3); \ + } + +void ScaleARGBRowDown2_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + (void)src_stride; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + dst0 = __lsx_vpickod_w(src1, src0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + (void)src_stride; + __m128i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_w(src1, src0); + tmp1 = __lsx_vpickod_w(src1, src0); + dst0 = __lsx_vavgr_bu(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0; + __m128i reg0, reg1, reg2, reg3; + __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1); + DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2, + shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1); + dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2); + __lsx_vst(dst0, dst_argb, 0); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + int32_t stepx = src_stepx << 2; + (void)src_stride; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + dst0 = __lsx_vldrepl_w(src_argb, 0); + src_argb += stepx; + dst1 = __lsx_vldrepl_w(src_argb, 0); + src_argb += stepx; + dst2 = __lsx_vldrepl_w(src_argb, 0); + src_argb += stepx; + dst3 = __lsx_vldrepl_w(src_argb, 0); + src_argb += 
stepx; + __lsx_vstelm_w(dst0, dst_argb, 0, 0); + __lsx_vstelm_w(dst1, dst_argb, 4, 0); + __lsx_vstelm_w(dst2, dst_argb, 8, 0); + __lsx_vstelm_w(dst3, dst_argb, 12, 0); + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + int32_t stepx = src_stepx * 4; + const uint8_t* next_argb = src_argb + src_stride; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, dst0; + + for (x = 0; x < len; x++) { + tmp0 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp1 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp2 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp3 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp4 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + tmp5 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + tmp6 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + tmp7 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); + DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1); + DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5); + DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1); + dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2); + dst0 = __lsx_vshuf4i_b(dst0, 0xD8); + __lsx_vst(dst0, dst_argb, 0); + dst_argb += 16; + } +} + +void ScaleRowDown2_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 32; + __m128i src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 32; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 32; + const uint8_t* src_nex = src_ptr + src_stride; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vaddwod_h_bu, 
src0, src4, src1, src5, src2, src6, src3, + src7, tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + src_ptr += 64; + src_nex += 64; + dst += 32; + } +} + +void ScaleRowDown4_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 16; + __m128i src0, src1, src2, src3, tmp0, tmp1, dst0; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst, 0); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 16; + const uint8_t* ptr1 = src_ptr + src_stride; + const uint8_t* ptr2 = ptr1 + src_stride; + const uint8_t* ptr3 = ptr2 + src_stride; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, reg0, reg1, reg2, reg3); + DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1); + dst0 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(dst0, dst, 0); + src_ptr += 64; + ptr1 += 64; + ptr2 += 64; + ptr3 += 64; + dst += 16; + } +} + +void ScaleRowDown38_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, len; + __m128i src0, src1, tmp0; + __m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816}; + + assert(dst_width % 3 == 0); + len = dst_width / 12; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + tmp0 = __lsx_vshuf_b(src1, src0, shuff); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst, 8, 2); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, len; + const uint8_t* src_nex = src_ptr + src_stride; + __m128i src0, src1, src2, src3, dst0; + 
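  // 2-row 3/8 box scale: each 6-sample (3x2) sum is scaled by the
+  // multiply-high constant 0x2AAA (~65536/6); the last 2x2 column uses
+  // 0x4000 (65536/4) plus a byte pick, i.e. (sum << 14) >> 16 = sum / 4. +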
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3; + __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A}; + __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA); + __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + len = dst_width / 12; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); + DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); + DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3); + tmp4 = __lsx_vpickev_w(reg3, reg2); + tmp5 = __lsx_vadd_h(reg0, reg1); + tmp6 = __lsx_vadd_h(tmp5, tmp4); + tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA); + tmp0 = __lsx_vpickod_w(reg3, reg2); + tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0); + tmp2 = __lsx_vmul_w(tmp1, const_0x4000); + dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff); + __lsx_vstelm_d(dst0, dst_ptr, 0, 0); + __lsx_vstelm_w(dst0, dst_ptr, 8, 2); + src_ptr += 32; + src_nex += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, len; + const uint8_t* ptr1 = src_ptr + src_stride; + const uint8_t* ptr2 = ptr1 + src_stride; + __m128i src0, src1, src2, src3, src4, src5; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, dst0; + __m128i zero = __lsx_vldi(0); + __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A}; + __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71); + __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + len = dst_width / 12; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1, + src2, src3); + DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5); + DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); + DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6); + DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3); + tmp4 = __lsx_vpickev_w(reg3, reg2); + tmp5 = __lsx_vadd_h(reg0, reg1); + tmp6 = __lsx_vadd_h(tmp5, tmp4); + tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71); + tmp0 = __lsx_vpickod_w(reg3, reg2); + tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0); + tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA); + dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff); + __lsx_vstelm_d(dst0, dst_ptr, 0, 0); + __lsx_vstelm_w(dst0, dst_ptr, 8, 2); + src_ptr += 32; + ptr1 += 32; + ptr2 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + int len = src_width / 16; + __m128i src0, tmp0, tmp1, dst0, dst1; + __m128i zero = __lsx_vldi(0); + + assert(src_width > 0); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1); + tmp0 = __lsx_vilvl_b(zero, src0); + tmp1 = __lsx_vilvh_b(zero, src0); + DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1); + __lsx_vst(dst0, dst_ptr, 0); + __lsx_vst(dst1, dst_ptr, 16); + src_ptr += 16; + dst_ptr += 16; + 
} +} + +void ScaleFilterCols_LSX(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + int len = dst_width / 16; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i vec0, vec1, dst0; + __m128i vec_x = __lsx_vreplgr2vr_w(x); + __m128i vec_dx = __lsx_vreplgr2vr_w(dx); + __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF); + __m128i const2 = __lsx_vreplgr2vr_w(0x40); + __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; + + vec0 = __lsx_vmul_w(vec_dx, const_tmp); + vec1 = __lsx_vslli_w(vec_dx, 2); + vec_x = __lsx_vadd_w(vec_x, vec0); + + for (j = 0; j < len; j++) { + tmp0 = __lsx_vsrai_w(vec_x, 16); + tmp4 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp1 = __lsx_vsrai_w(vec_x, 16); + tmp5 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp2 = __lsx_vsrai_w(vec_x, 16); + tmp6 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp3 = __lsx_vsrai_w(vec_x, 16); + tmp7 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5, + tmp6, tmp7); + LOAD_DATA(src_ptr, tmp0, reg0); + LOAD_DATA(src_ptr, tmp1, reg1); + LOAD_DATA(src_ptr, tmp2, reg2); + LOAD_DATA(src_ptr, tmp3, reg3); + DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1, + tmp2, tmp3); + LOAD_DATA(src_ptr, tmp0, reg4); + LOAD_DATA(src_ptr, tmp1, reg5); + LOAD_DATA(src_ptr, tmp2, reg6); + LOAD_DATA(src_ptr, tmp3, reg7); + DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7, + const2, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5, + reg6, reg7); + DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1); + dst0 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_LSX(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)src_argb; + uint32_t* dst = (uint32_t*)dst_argb; + int j; + int len = dst_width / 4; + __m128i tmp0, tmp1, tmp2, dst0; + __m128i vec_x = __lsx_vreplgr2vr_w(x); + __m128i vec_dx = __lsx_vreplgr2vr_w(dx); + __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; + + tmp0 = __lsx_vmul_w(vec_dx, const_tmp); + tmp1 = __lsx_vslli_w(vec_dx, 2); + vec_x = __lsx_vadd_w(vec_x, tmp0); + + for (j = 0; j < len; j++) { + tmp2 = __lsx_vsrai_w(vec_x, 16); + vec_x = __lsx_vadd_w(vec_x, tmp1); + LOAD_DATA(src, tmp2, dst0); + __lsx_vst(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_LSX(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)src_argb; + int j; + int len = dst_width / 8; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i vec0, vec1, dst0, dst1; + __m128i vec_x = __lsx_vreplgr2vr_w(x); + __m128i vec_dx = __lsx_vreplgr2vr_w(dx); + __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; + __m128i const_7f = __lsx_vldi(0x7F); + + vec0 = 
__lsx_vmul_w(vec_dx, const_tmp); + vec1 = __lsx_vslli_w(vec_dx, 2); + vec_x = __lsx_vadd_w(vec_x, vec0); + + for (j = 0; j < len; j++) { + tmp0 = __lsx_vsrai_w(vec_x, 16); + reg0 = __lsx_vsrai_w(vec_x, 9); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp1 = __lsx_vsrai_w(vec_x, 16); + reg1 = __lsx_vsrai_w(vec_x, 9); + vec_x = __lsx_vadd_w(vec_x, vec1); + DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1); + DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1); + DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6); + DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7); + LOAD_DATA(src, tmp0, src0); + LOAD_DATA(src, tmp1, src1); + DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1); + LOAD_DATA(src, tmp0, src2); + LOAD_DATA(src, tmp1, src3); + DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6); + DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2; + __m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B}; + __m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110}; + __m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0, + dst1); + dst2 = __lsx_vshuf_b(src3, src2, shuff2); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* src_nex = src_ptr + src_stride; + int x; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; + __m128i tmp10, tmp11, dst0, dst1, dst2; + __m128i const0 = {0x0103030101010103, 0x0101010303010101}; + __m128i const1 = {0x0301010101030301, 0x0103030101010103}; + __m128i const2 = {0x0101010303010101, 0x0301010101030301}; + __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605}; + __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110}; + __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A}; + __m128i shift0 = {0x0002000200010002, 0x0001000200020001}; + __m128i shift1 = {0x0002000100020002, 0x0002000200010002}; + __m128i shift2 = {0x0001000200020001, 0x0002000100020002}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, + src4, src5, src6, src7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1, + shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4, + shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7); + 
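    // The vshf_b shuffles pair each output pixel with its two source taps;
+    // the vdp2_h_bu dot products apply the (3,1)/(1,1)/(1,3) column weights
+    // and vsrar_h divides by the per-column 4 or 2 in shift0..shift2. +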
DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6, + shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3, + const0, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7, + const1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11, + const2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3, + shift0, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7, + shift1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3, + shift2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6, + tmp7, tmp8); + DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10); + DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5); + DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5); + DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1); + dst2 = __lsx_vsrarni_b_h(src5, src4, 2); + __lsx_vst(dst0, d, 0); + __lsx_vst(dst1, d, 16); + __lsx_vst(dst2, d, 32); + src_ptr += 64; + src_nex += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* src_nex = src_ptr + src_stride; + int x; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; + __m128i tmp10, tmp11, dst0, dst1, dst2; + __m128i const0 = {0x0103030101010103, 0x0101010303010101}; + __m128i const1 = {0x0301010101030301, 0x0103030101010103}; + __m128i const2 = {0x0101010303010101, 0x0301010101030301}; + __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605}; + __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110}; + __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A}; + __m128i shift0 = {0x0002000200010002, 0x0001000200020001}; + __m128i shift1 = {0x0002000100020002, 0x0002000200010002}; + __m128i shift2 = {0x0001000200020001, 0x0002000100020002}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, + src4, src5, src6, src7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1, + shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4, + shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6, + shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3, + const0, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7, + const1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11, + const2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3, + shift0, src0, src1, 
src2, src3); + DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7, + shift1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3, + shift2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5); + DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1); + dst2 = __lsx_vsrarni_b_h(src5, src4, 1); + __lsx_vst(dst0, d, 0); + __lsx_vst(dst1, d, 16); + __lsx_vst(dst2, d, 32); + src_ptr += 64; + src_nex += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/source/scale_msa.cc b/source/scale_msa.cc new file mode 100644 index 00000000..482a521f --- /dev/null +++ b/source/scale_msa.cc @@ -0,0 +1,949 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ + { \ + out0[0] = srcp[indx0[0]]; \ + out0[1] = srcp[indx0[1]]; \ + out0[2] = srcp[indx0[2]]; \ + out0[3] = srcp[indx0[3]]; \ + } + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, 
(v16i8)src2); + vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + dst0 = 
(v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = __msa_aver_u_b(vec1, vec0); + dst1 = __msa_aver_u_b(vec3, vec2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst, 16); + s += 64; + t += 64; + dst += 32; + } +} + +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + const uint8_t* t2 = s + src_stride * 3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = 
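+    // Explanatory note (added for clarity): each 4x4 block is summed with
+    // widening adds, u8 -> u16 (__msa_hadd_u_h) and then u16 -> u32
+    // (__msa_hadd_u_w), so the 16-pixel sum cannot overflow;
+    // __msa_srari_w(sum, 4) then forms (sum + 8) >> 4, the rounded mean.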
(v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); + vec0 += __msa_hadd_u_h(src0, src0); + vec1 += __msa_hadd_u_h(src1, src1); + vec2 += __msa_hadd_u_h(src2, src2); + vec3 += __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + reg0 = __msa_hadd_u_w(vec0, vec0); + reg1 = __msa_hadd_u_w(vec1, vec1); + reg2 = __msa_hadd_u_w(vec2, vec2); + reg3 = __msa_hadd_u_w(vec3, vec3); + reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); + reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); + reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); + reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + s += 64; + t0 += 64; + t1 += 64; + t2 += 64; + dst += 16; + } +} + +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, width; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, vec0; + v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; + (void)src_stride; + + assert(dst_width % 3 == 0); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); + dst0 = __msa_copy_u_d((v2i64)vec0, 0); + dst1 = __msa_copy_u_w((v4i32)vec0, 2); + SD(dst0, dst); + SW(dst1, dst + 8); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8i16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, 
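+    // Explanatory note (added for clarity): for the two-row 3/8 kernel,
+    // two of every three outputs average a 3x2 block (6 pixels) and the
+    // third a 2x2 block (4 pixels); const_0x2AAA ~= 65536 / 6 and
+    // const_0x4000 = 65536 / 4 turn those divides into multiply + >> 16.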
(v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x2AAA; + tmp1 *= const_0x2AAA; + tmp4 *= const_0x4000; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, src4, src5, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8u16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); + vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); + vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); + vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, 
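+    // Explanatory note (added for clarity; an interpretation of the
+    // constants): the three-row variant averages 3x3 blocks (9 pixels)
+    // for two of every three outputs and a 2x3 block (6 pixels) for the
+    // third; const_0x1C71 ~= 65536 / 9 and const_0x2AAA ~= 65536 / 6.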
(v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x1C71; + tmp1 *= const_0x1C71; + tmp4 *= const_0x2AAA; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t0 += 32; + t1 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + v16u8 src0; + v8u16 dst0, dst1; + v16i8 zero = {0}; + + assert(src_width > 0); + + for (x = 0; x < src_width; x += 16) { + src0 = LD_UB(src_ptr); + dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); + dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); + dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + ST_UH2(dst0, dst1, dst_ptr, 8); + src_ptr += 16; + dst_ptr += 16; + } +} + +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + v4i32 vec_x = __msa_fill_w(x); + v4i32 vec_dx = __msa_fill_w(dx); + v4i32 vec_const = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 reg0, reg1; + v16u8 dst0; + v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); + v4i32 const_0x40 = __msa_fill_w(0x40); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 16) { + vec2 = vec_x >> 16; + vec6 = vec_x & const_0xFFFF; + vec_x += vec1; + vec3 = vec_x >> 16; + vec7 = vec_x & const_0xFFFF; + vec_x += vec1; + vec4 = vec_x >> 16; + vec8 = vec_x & const_0xFFFF; + vec_x += vec1; + vec5 = vec_x >> 16; + vec9 = vec_x & const_0xFFFF; + vec_x += vec1; + vec6 >>= 9; + vec7 >>= 9; + vec8 >>= 9; + vec9 >>= 9; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); + vec2 += 1; + vec3 += 1; + vec4 += 1; + vec5 += 1; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); + tmp4 -= tmp0; + tmp5 -= tmp1; + tmp6 -= tmp2; + tmp7 -= tmp3; + tmp4 *= vec6; + tmp5 *= vec7; + tmp6 *= vec8; + tmp7 *= vec9; + tmp4 += const_0x40; + tmp5 += const_0x40; + tmp6 += const_0x40; + tmp7 += const_0x40; + tmp4 >>= 7; + tmp5 >>= 7; + tmp6 >>= 7; + tmp7 >>= 7; + tmp0 += tmp4; + tmp1 += tmp5; + tmp2 += tmp6; + tmp3 += tmp7; + reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + reg1 = 
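+    // Explanatory note (added for clarity): x and dx are 16.16 fixed
+    // point; the fraction is reduced to 7 bits (>> 9) and each output
+    // above is the linear blend of neighbours a = src[xi], b = src[xi+1]:
+    //   dst = a + (((b - a) * f + 0x40) >> 7)   // f in [0, 127]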
(v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + __msa_st_b(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + v4i32 x_vec = __msa_fill_w(x); + v4i32 dx_vec = __msa_fill_w(dx); + v4i32 const_vec = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2; + v4i32 dst0; + + vec0 = dx_vec * const_vec; + vec1 = dx_vec * 4; + x_vec += vec0; + + for (j = 0; j < dst_width; j += 4) { + vec2 = x_vec >> 16; + x_vec += vec1; + LOAD_INDEXED_DATA(src, vec2, dst0); + __msa_st_w(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + int j; + v4u32 src0, src1, src2, src3; + v4u32 vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 mult0, mult1, mult2, mult3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1; + v4u32 vec_x = (v4u32)__msa_fill_w(x); + v4u32 vec_dx = (v4u32)__msa_fill_w(dx); + v4u32 vec_const = {0, 1, 2, 3}; + v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 8) { + vec2 = vec_x >> 16; + reg0 = (v16u8)(vec_x >> 9); + vec_x += vec1; + vec3 = vec_x >> 16; + reg1 = (v16u8)(vec_x >> 9); + vec_x += vec1; + reg0 = reg0 & const_0x7f; + reg1 = reg1 & const_0x7f; + reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); + reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); + reg2 = reg0 ^ const_0x7f; + reg3 = reg1 ^ const_0x7f; + mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); + mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); + mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); + mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); + LOAD_INDEXED_DATA(src, vec2, src0); + LOAD_INDEXED_DATA(src, vec3, src1); + vec2 += 1; + vec3 += 1; + LOAD_INDEXED_DATA(src, vec2, src2); + LOAD_INDEXED_DATA(src, vec3, src3); + reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + tmp0 = __msa_dotp_u_h(reg4, mult0); + tmp1 = __msa_dotp_u_h(reg5, mult1); + tmp2 = __msa_dotp_u_h(reg6, mult2); + tmp3 = __msa_dotp_u_h(reg7, mult3); + tmp0 >>= 7; + tmp1 >>= 7; + tmp2 >>= 7; + tmp3 >>= 7; + dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + __msa_st_b(dst0, dst_argb, 0); + __msa_st_b(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + v16u8 src0, src1, src2, src3; + v16u8 vec0, vec1, vec2; + v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; + v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; + v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, + 21, 23, 24, 25, 27, 28, 29, 31}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = 
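+    // Explanatory note (added for clarity): mask0/mask1/mask2 keep three
+    // of every four bytes (indices 0, 1 and 3 of each group of 4), so 64
+    // input pixels point-sample to 48 outputs with no filtering.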
(v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); + __msa_st_b((v16i8)vec0, dst, 0); + __msa_st_b((v16i8)vec1, dst, 16); + __msa_st_b((v16i8)vec2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 = reg0 * 3 + reg6; + reg1 = reg1 
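+    // Explanatory note (added for clarity): reg0..reg5 hold the
+    // horizontally filtered upper row and reg6..reg11 the lower row;
+    // they are combined 3:1 and rounded, i.e. per output
+    //   dst = (3 * row0 + row1 + 2) >> 2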
* 3 + reg7; + reg2 = reg2 * 3 + reg8; + reg3 = reg3 * 3 + reg9; + reg4 = reg4 * 3 + reg10; + reg5 = reg5 * 3 + reg11; + reg0 = __msa_srari_h(reg0, 2); + reg1 = __msa_srari_h(reg1, 2); + reg2 = __msa_srari_h(reg2, 2); + reg3 = __msa_srari_h(reg3, 2); + reg4 = __msa_srari_h(reg4, 2); + reg5 = __msa_srari_h(reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, 
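+    // Explanatory note (added for clarity): the "_1_Box" variant differs
+    // from "_0_Box" only in the vertical weights -- after the same
+    // horizontal filtering the two rows are averaged equally:
+    //   dst = (row0 + row1 + 1) >> 1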
shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 += reg6; + reg1 += reg7; + reg2 += reg8; + reg3 += reg9; + reg4 += reg10; + reg5 += reg11; + reg0 = __msa_srari_h(reg0, 1); + reg1 = __msa_srari_h(reg1, 1); + reg2 = __msa_srari_h(reg2, 1); + reg3 = __msa_srari_h(reg3, 1); + reg4 = __msa_srari_h(reg4, 1); + reg5 = __msa_srari_h(reg5, 1); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/source/scale_neon.cc b/source/scale_neon.cc new file mode 100644 index 00000000..ccc75106 --- /dev/null +++ b/source/scale_neon.cc @@ -0,0 +1,1533 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// NEON downscalers with interpolation. +// Provided by Fritz Koenig + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + // row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + // pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc"); +} + +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc"); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc"); +} + +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1 + 2) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! 
\n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", + "cc"); +} + +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); +} + +#define HAS_SCALEROWDOWN38_NEON +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile( + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
+
+  asm volatile(
+      "vld1.16    {q13}, [%5]                \n"
+      "vld1.8     {q14}, [%6]                \n"
+      "vld1.8     {q15}, [%7]                \n"
+      "add        %3, %0                     \n"
+      "1:                                    \n"
+
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8     {d0, d1, d2, d3}, [%0]!    \n"
+      "vld4.8     {d4, d5, d6, d7}, [%3]!    \n"
+      "vld4.8     {d16, d17, d18, d19}, [%4]! \n"
+      "subs       %2, %2, #12                \n"
+
+      // Shuffle the input data around to align the data
+      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8    d0, d1                     \n"
+      "vtrn.u8    d4, d5                     \n"
+      "vtrn.u8    d16, d17                   \n"
+
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8    d2, d3                     \n"
+      "vtrn.u8    d6, d7                     \n"
+      "vtrn.u8    d18, d19                   \n"
+
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8  q0, q0                     \n"
+      "vpaddl.u8  q2, q2                     \n"
+      "vpaddl.u8  q8, q8                     \n"
+
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8  d3, d3                     \n"
+      "vpaddl.u8  d7, d7                     \n"
+      "vpaddl.u8  d19, d19                   \n"
+
+      // combine source lines
+      "vadd.u16   q0, q2                     \n"
+      "vadd.u16   q0, q8                     \n"
+      "vadd.u16   d4, d3, d7                 \n"
+      "vadd.u16   d4, d19                    \n"
+
+      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+      //             + s[6 + st * 1] + s[7 + st * 1]
+      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+      "vqrdmulh.s16 q2, q2, q13              \n"
+      "vmovn.u16  d4, q2                     \n"
+
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      // 0,1 reg and 3 can be added to the 4,5 reg. This
+      // requires expanding from u8 to u16 as the 0,1 and 4,5
+      // registers are already expanded. Then do transposes
+      // to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8   q1, d2                     \n"
+      "vmovl.u8   q3, d6                     \n"
+      "vmovl.u8   q9, d18                    \n"
+
+      // combine source lines
+      "vadd.u16   q1, q3                     \n"
+      "vadd.u16   q1, q9                     \n"
+
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32   d2, d3                     \n"
+
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16   d2, d3                     \n"
+
+      // 0+1+2, 3+4+5
+      "vadd.u16   q0, q1                     \n"
+
+      // Need to divide, but can't downshift as the value
+      // isn't a power of 2. So multiply by 65536 / n
+      // and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q15              \n"
+
+      // Align for table lookup, vtbl requires registers to
+      // be adjacent
+      "vmov.u8    d2, d4                     \n"
+
+      "vtbl.u8    d3, {d0, d1, d2}, d28      \n"
+      "vtbl.u8    d4, {d0, d1, d2}, d29      \n"
+
+      "vst1.8     {d3}, [%1]!                \n"
+      "vst1.32    {d4[0]}, [%1]!             \n"
+      "bgt        1b                         \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride),    // %3
+        "+r"(src_ptr1)       // %4
+      : "r"(&kMult38_Div6),  // %5
+        "r"(&kShuf38_2),     // %6
+        "r"(&kMult38_Div9)   // %7
+      : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+        "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vld1.16    {q13}, [%4]                \n"
+      "vld1.8     {q14}, [%5]                \n"
+      "add        %3, %0                     \n"
+      "1:                                    \n"
+
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8     {d0, d1, d2, d3}, [%0]!    \n"
+      "vld4.8     {d4, d5, d6, d7}, [%3]!    \n"
+      "subs       %2, %2, #12                \n"
+
+      // Shuffle the input data around to align the data
+      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8    d0, d1                     \n"
+      "vtrn.u8    d4, d5                     \n"
+
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8    d2, d3                     \n"
+      "vtrn.u8    d6, d7                     \n"
+
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8  q0, q0                     \n"
+      "vpaddl.u8  q2, q2                     \n"
+
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8  d3, d3                     \n"
+      "vpaddl.u8  d7, d7                     \n"
+
+      // combine source lines
+      "vadd.u16   q0, q2                     \n"
+      "vadd.u16   d4, d3, d7                 \n"
+
+      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+      "vqrshrn.u16 d4, q2, #2                \n"
+
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      // 0,1 reg and 3 can be added to the 4,5 reg. This
+      // requires expanding from u8 to u16 as the 0,1 and 4,5
+      // registers are already expanded. Then do transposes
+      // to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8   q1, d2                     \n"
+      "vmovl.u8   q3, d6                     \n"
+
+      // combine source lines
+      "vadd.u16   q1, q3                     \n"
+
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32   d2, d3                     \n"
+
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16   d2, d3                     \n"
+
+      // 0+1+2, 3+4+5
+      "vadd.u16   q0, q1                     \n"
+
+      // Need to divide, but can't downshift as the value
+      // isn't a power of 2. So multiply by 65536 / n
+      // and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q13              \n"
+
+      // Align for table lookup, vtbl requires registers to
+      // be adjacent
+      "vmov.u8    d2, d4                     \n"
+
+      "vtbl.u8    d3, {d0, d1, d2}, d28      \n"
+      "vtbl.u8    d4, {d0, d1, d2}, d29      \n"
+
+      "vst1.8     {d3}, [%1]!                \n"
+      "vst1.32    {d4[0]}, [%1]!             \n"
+      "bgt        1b                         \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride)     // %3
+      : "r"(&kMult38_Div6),  // %4
+        "r"(&kShuf38_2)      // %5
+      : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
+}
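+// Scalar reference for the 2x upsamplers below (an illustrative addition,
+// not part of the original patch): each output pair blends a "near" and a
+// "far" source sample with 3:1 weights and rounding.
+static inline void ScaleRowUp2LinearPair_C_Sketch(const uint8_t* s,
+                                                  uint8_t* d,
+                                                  int i) {
+  d[2 * i + 0] = (uint8_t)((3 * s[i] + s[i + 1] + 2) >> 2);  // even output
+  d[2 * i + 1] = (uint8_t)((s[i] + 3 * s[i + 1] + 2) >> 2);  // odd output
+}
+// The bilinear variants apply the same blend vertically as well, which
+// yields the 9/3/3/1 weights noted in the asm comments:
+// (9 * a + 3 * b + 3 * c + d + 8) >> 4.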
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  const uint8_t* src_temp = src_ptr + 1;
+  asm volatile(
+      "vmov.u8    d30, #3                    \n"
+
+      "1:                                    \n"
+      "vld1.8     {d4}, [%0]!                \n"  // 01234567
+      "vld1.8     {d5}, [%3]!                \n"  // 12345678
+
+      "vmovl.u8   q0, d4                     \n"  // 01234567 (16b)
+      "vmovl.u8   q1, d5                     \n"  // 12345678 (16b)
+      "vmlal.u8   q0, d5, d30                \n"  // 3*near+far (odd)
+      "vmlal.u8   q1, d4, d30                \n"  // 3*near+far (even)
+
+      "vrshrn.u16 d1, q0, #2                 \n"  // 3/4*near+1/4*far (odd)
+      "vrshrn.u16 d0, q1, #2                 \n"  // 3/4*near+1/4*far (even)
+
+      "vst2.8     {d0, d1}, [%1]!            \n"  // store
+      "subs       %2, %2, #16                \n"  // 8 sample -> 16 sample
+      "bgt        1b                         \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_temp)    // %3
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q15"  // Clobber List
+  );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+  const uint8_t* src_temp = src_ptr + 1;
+  const uint8_t* src_temp1 = src_ptr1 + 1;
+
+  asm volatile(
+      "vmov.u16   q15, #3                    \n"
+      "vmov.u8    d28, #3                    \n"
+
+      "1:                                    \n"
+      "vld1.8     {d4}, [%0]!                \n"  // 01234567
+      "vld1.8     {d5}, [%5]!                \n"  // 12345678
+
+      "vmovl.u8   q0, d4                     \n"  // 01234567 (16b)
+      "vmovl.u8   q1, d5                     \n"  // 12345678 (16b)
+      "vmlal.u8   q0, d5, d28                \n"  // 3*near+far (1, odd)
+      "vmlal.u8   q1, d4, d28                \n"  // 3*near+far (1, even)
+
+      "vld1.8     {d8}, [%1]!                \n"
+      "vld1.8     {d9}, [%6]!                \n"
+
+      "vmovl.u8   q2, d8                     \n"
+      "vmovl.u8   q3, d9                     \n"
+      "vmlal.u8   q2, d9, d28                \n"  // 3*near+far (2, odd)
+      "vmlal.u8   q3, d8, d28                \n"  // 3*near+far (2, even)
+
+      //   e        o
+      //   q1       q0
+      //   q3       q2
+
+      "vmovq      q4, q2                     \n"
+      "vmovq      q5, q3                     \n"
+      "vmla.u16   q4, q0, q15                \n"  // 9 3 3 1 (1, odd)
+      "vmla.u16   q5, q1, q15                \n"  // 9 3 3 1 (1, even)
+      "vmla.u16   q0, q2, q15                \n"  // 9 3 3 1 (2, odd)
+      "vmla.u16   q1, q3, q15                \n"  // 9 3 3 1 (2, even)
+
+      //   e        o
+      //   q5       q4
+      //   q1       q0
+
+      "vrshrn.u16 d2, q1, #4                 \n"  // 2, even
+      "vrshrn.u16 d3, q0, #4                 \n"  // 2, odd
+      "vrshrn.u16 d0, q5, #4                 \n"  // 1, even
+      "vrshrn.u16 d1, q4, #4                 \n"  // 1, odd
+
+      "vst2.8     {d0, d1}, [%2]!            \n"  // store
+      "vst2.8     {d2, d3}, [%3]!
\n" // store + "subs %4, %4, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", + "q15" // Clobber List + ); +} + +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u16 q15, #3 \n" + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) + + "vmovq q2, q0 \n" + "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) + "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) + + "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 q15, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) + + "vmovq q2, q0 \n" + "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) + "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) + + "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) + "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) + + "vmovq q4, q2 \n" + "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd) + "vmla.u16 q3, q4, q15 \n" // 3*near+far (even) + + "vmovq q4, q2 \n" + "vmovq q5, q3 \n" + "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) + "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) + "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) + "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) + + "vrshr.u16 q2, q1, #4 \n" // 2, even + "vrshr.u16 q3, q0, #4 \n" // 2, odd + "vrshr.u16 q0, q5, #4 \n" // 1, even + "vrshr.u16 q1, q4, #4 \n" // 1, odd + + "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store + "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store + "subs %4, %4, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q15" // Clobber List + ); +} + +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u16 d31, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) + + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 4567 (32b) + "vmovl.u16 q4, d2 \n" // 1234 (32b) + "vmovl.u16 q5, d3 \n" // 5678 (32b) + + "vmlal.u16 q2, d2, d31 \n" + "vmlal.u16 q3, d3, d31 \n" + "vmlal.u16 q4, d0, d31 \n" + "vmlal.u16 q5, d1, d31 \n" + + "vrshrn.u32 d0, q4, #2 \n" + "vrshrn.u32 d1, q5, #2 \n" + "vrshrn.u32 d2, q2, #2 \n" + "vrshrn.u32 d3, q3, #2 \n" + + "vst2.16 {q0, q1}, [%1]! 
\n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 d31, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.16 {d0}, [%0]! \n" // 0123 (16b) + "vld1.16 {d1}, [%5]! \n" // 1234 (16b) + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 1234 (32b) + "vmlal.u16 q2, d1, d31 \n" + "vmlal.u16 q3, d0, d31 \n" + + "vld1.16 {d0}, [%1]! \n" // 0123 (16b) + "vld1.16 {d1}, [%6]! \n" // 1234 (16b) + "vmovl.u16 q4, d0 \n" // 0123 (32b) + "vmovl.u16 q5, d1 \n" // 1234 (32b) + "vmlal.u16 q4, d1, d31 \n" + "vmlal.u16 q5, d0, d31 \n" + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" + "vmla.u32 q5, q3, q14 \n" + "vmla.u32 q2, q0, q14 \n" + "vmla.u32 q3, q1, q14 \n" + + "vrshrn.u32 d1, q4, #4 \n" + "vrshrn.u32 d0, q5, #4 \n" + "vrshrn.u32 d3, q2, #4 \n" + "vrshrn.u32 d2, q3, #4 \n" + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #8 \n" // 4 sample -> 8 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d31" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 2; + asm volatile( + "vmov.u8 d30, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) + "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v) + + "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) + "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) + + "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.16 {d0, d1}, [%1]! \n" // store + "subs %2, %2, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 2; + const uint8_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "vmov.u16 q15, #3 \n" + "vmov.u8 d28, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) + "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) + + "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) + "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) + + "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v) + "vld1.8 {d9}, [%6]! 
\n" // 11223344 (1u1v) + + "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) + "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) + + // e o + // q1 q0 + // q3 q2 + + "vmovq q4, q2 \n" + "vmovq q5, q3 \n" + "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) + "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) + "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) + "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) + + // e o + // q5 q4 + // q1 q0 + + "vrshrn.u16 d2, q1, #4 \n" // 2, even + "vrshrn.u16 d3, q0, #4 \n" // 2, odd + "vrshrn.u16 d0, q5, #4 \n" // 1, even + "vrshrn.u16 d1, q4, #4 \n" // 1, odd + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", + "q15" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 2; + asm volatile( + "vmov.u16 d30, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) + "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16) + + "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b) + "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b) + "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b) + "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd) + "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even) + "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd) + "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even) + + "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even) + "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.32 {d0, d1}, [%1]! \n" // store + "vst2.32 {d2, d3}, [%1]! \n" // store + "subs %2, %2, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "d30" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 2; + const uint16_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "vmov.u16 d30, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) + "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v) + "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b) + "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd) + "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even) + + "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v) + "vld1.8 {d1}, [%6]! 
\n" // 1122 (1u1v) + "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b) + "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd) + "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even) + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd) + "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even) + "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd) + "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even) + + "vrshrn.u32 d1, q4, #4 \n" // 1, odd + "vrshrn.u32 d0, q5, #4 \n" // 1, even + "vrshrn.u32 d3, q2, #4 \n" // 2, odd + "vrshrn.u32 d2, q3, #4 \n" // 2, even + + "vst2.32 {d0, d1}, [%2]! \n" // store + "vst2.32 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #4 \n" // 2 uv -> 4 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d30" // Clobber List + ); +} + +// Add a row of bytes to a row of shorts. Used for box filter. +// Reads 16 bytes and accumulates to 16 shorts at a time. +void ScaleAddRow_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + "1: \n" + "vld1.16 {q1, q2}, [%1] \n" // load accumulator + "vld1.8 {q0}, [%0]! \n" // load 16 bytes + "vaddw.u8 q2, q2, d1 \n" // add + "vaddw.u8 q1, q1, d0 \n" + "vst1.16 {q1, q2}, [%1]! \n" // store accumulator + "subs %2, %2, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_ptr; + asm volatile ( + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q1, q1, q0 \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vrshrn.s32 d18, q11, #16 \n" + "vrshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + "vst1.8 {d6}, [%0]! 
\n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); +} + +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! +// 4a: 3e04 subs r6, #4 +// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! +// 50: ef64 21f4 vorr q9, q10, q10 +// 54: f942 038d vst2.32 {d16-d19}, [r2]! +// 58: d1f5 bne.n 46 + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld1.32 {" #dn "[" #n "]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int tmp; + const uint8_t* src_tmp = src_argb; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + // clang-format on + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1"); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_argb; + asm volatile ( + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q8, q1, q0 \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(d0, d2, 0) + LOAD2_DATA32_LANE(d0, d2, 1) + LOAD2_DATA32_LANE(d1, d3, 0) + LOAD2_DATA32_LANE(d1, d3, 1) + "vshrn.i32 d22, q8, #9 \n" + "vand.16 d22, d22, d30 \n" + "vdup.8 d24, d22[0] \n" + "vdup.8 d25, d22[2] \n" + "vdup.8 d26, d22[4] \n" + "vdup.8 d27, d22[6] \n" + "vext.8 d4, d24, d25, #4 \n" + "vext.8 d5, d26, d27, #4 \n" // f + "veor.8 q10, q2, q3 \n" // 0x7f ^ f + "vmull.u8 q11, d0, d20 \n" + "vmull.u8 q12, d1, d21 \n" + "vmull.u8 q13, d2, d4 \n" + "vmull.u8 q14, d3, d5 \n" + "vadd.i16 q11, q11, q13 \n" + "vadd.i16 q12, q12, q14 \n" + "vshrn.i16 d0, q11, #7 \n" + "vshrn.i16 d1, q12, #7 \n" + + "vst1.32 {d0, d1}, [%0]! \n" // store pixels + "vadd.s32 q8, q8, q9 \n" + "subs %2, %2, #4 \n" // 4 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#undef LOAD2_DATA32_LANE + +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.16 {q1}, [%1]! 
\n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.16 {q0}, [%1]! \n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. + "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV + "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV + "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vst2.8 {d0, d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q8", "q9"); +} + +// Reads 4 pixels at a time. +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "vld1.16 {d0[0]}, [%0], %6 \n" + "vld1.16 {d0[1]}, [%1], %6 \n" + "vld1.16 {d0[2]}, [%2], %6 \n" + "vld1.16 {d0[3]}, [%3], %6 \n" + "subs %5, %5, #4 \n" // 4 pixels per loop. + "vst1.8 {d0}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"(src_stepx * 8) // %6 + : "memory", "cc", "d0"); +} + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc new file mode 100644 index 00000000..ad06ee83 --- /dev/null +++ b/source/scale_neon64.cc @@ -0,0 +1,1673 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 32x1 throw away even pixels, and write 16x1. 
+void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uadalp v0.8h, v3.16b \n" + "prfm pldl1keep, [%4, 448] \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + 
"+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + // (3 * line_0 + line_1 + 2) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); +} + +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + 
"+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); +} + +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + ptrdiff_t tmp_src_stride = src_stride; + + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" + + // Align for table lookup, vtbl requires registers to be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + // TODO(fbarchard): use src_stride directly for clang 3.5+. + ptrdiff_t tmp_src_stride = src_stride; + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); +} + +void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8b, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 01234567 + "ldr d1, [%1], #8 \n" // 12345678 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) + "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) + + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) + + "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 1; + const uint8_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.8b, #3 \n" + "movi v30.8h, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 01234567 + "ldr d1, [%2], #8 \n" // 12345678 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) + "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + + "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b) + "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b) + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) + "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) + + "rshrn v2.8b, v2.8h, #4 \n" // 2, odd + "rshrn v1.8b, v3.8h, #4 \n" // 2, even + "rshrn v4.8b, v4.8h, #4 \n" // 1, odd + "rshrn 
v3.8b, v5.8h, #4 \n" // 1, even + + "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1 + "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2 + "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "mov v2.16b, v0.16b \n" + "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) + "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) + + "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd) + "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "mov v0.16b, v2.16b \n" + "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) + "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) + + "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b) + "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + + "mov v0.16b, v4.16b \n" + "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) + "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) + "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even) + + "urshr v2.8h, v2.8h, #4 \n" // 2, odd + "urshr v1.8h, v3.8h, #4 \n" // 2, even + "urshr v4.8h, v4.8h, #4 \n" // 1, odd + "urshr v3.8h, v5.8h, #4 \n" // 1, even + + "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1 + "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2 + + "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v31" // Clobber List + ); +} + +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) 
+ "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b) + "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) + "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) + + "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) + "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) + + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 0123 (16b) + "ldr d1, [%2], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" // 0123 (16b) + "ldr d1, [%3], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far + + "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 + "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 + + "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 2; + asm volatile( + "movi v31.8b, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 00112233 (1u1v) + "ldr d1, [%1], #8 \n" // 11223344 (1u1v) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b) + "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b) + + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) + + "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.4h, v2.4h}, [%2], 
#16 \n" // store + "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 2; + const uint8_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "movi v31.8b, #3 \n" + "movi v30.8h, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" + "ldr d1, [%2], #8 \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" + "ushll v3.8h, v1.8b, #0 \n" + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + + "ushll v4.8h, v0.8b, #0 \n" + "ushll v5.8h, v1.8b, #0 \n" + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) + "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) + + "rshrn v2.8b, v2.8h, #4 \n" // 2, odd + "rshrn v1.8b, v3.8h, #4 \n" // 2, even + "rshrn v4.8b, v4.8h, #4 \n" // 1, odd + "rshrn v3.8b, v5.8h, #4 \n" // 1, even + + "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2 + "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1 + "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 2; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even) + "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd) + "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even) + + "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even) + "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store + "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store + "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int 
dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 2; + const uint16_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" + "ldr d1, [%2], #8 \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v2.4s, #4 \n" // 2, odd + "rshrn v0.4h, v3.4s, #4 \n" // 2, even + "rshrn v3.4h, v4.4s, #4 \n" // 1, odd + "rshrn v2.4h, v5.4s, #4 \n" // 1, even + + "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2 + "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1 + "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + +// Add a row of bytes to a row of shorts. Used for box filter. +// Reads 16 bytes and accumulates to 16 shorts at a time. 
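+// (Scalar equivalent, for reference: dst_ptr[i] = (uint16_t)(dst_ptr[i] +
+// src_ptr[i]) for each sample, matching ScaleAddRow_C.)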
+void ScaleAddRow_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + "1: \n" + "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator + "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes + "uaddw2 v2.8h, v2.8h, v0.16b \n" // add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v1.8h, v1.8h, v0.8b \n" + "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator + "subs %w2, %w2, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {v4.b, v5.b}[" #n "], [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_ptr; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v1.4s, v1.4s, v0.4s \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v16", "v17" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. 
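+      // dst = (row0 * (256 - f) + row1 * f + 128) >> 8, with f broadcast
+      // into v5 and (256 - f) into v4.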
+ "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); +} + +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + 
"subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64_t)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +// TODO(Yang Zhang): Might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "prfm pldl1keep, [%1, 448] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
+ "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64_t)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + int64_t tmp64; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + // clang-format on + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(v0, v1, 0) + LOAD2_DATA32_LANE(v0, v1, 1) + LOAD2_DATA32_LANE(v0, v1, 2) + LOAD2_DATA32_LANE(v0, v1, 3) + "shrn v2.4h, v5.4s, #9 \n" + "and v2.8b, v2.8b, v4.8b \n" + "dup v16.8b, v2.b[0] \n" + "dup v17.8b, v2.b[2] \n" + "dup v18.8b, v2.b[4] \n" + "dup v19.8b, v2.b[6] \n" + "ext v2.8b, v16.8b, v17.8b, #4 \n" + "ext v17.8b, v18.8b, v19.8b, #4 \n" + "ins v2.d[1], v17.d[0] \n" // f + "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f + "umull v16.8h, v0.8b, v7.8b \n" + "umull2 v17.8h, v0.16b, v7.16b \n" + "umull v18.8h, v1.8b, v2.8b \n" + "umull2 v19.8h, v1.16b, v2.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "add v16.8h, v16.8h, v18.8h \n" + "add v17.8h, v17.8h, v19.8h \n" + "shrn v0.8b, v16.8h, #7 \n" + "shrn2 v0.16b, v17.8h, #7 \n" + "st1 {v0.4s}, [%0], #16 \n" // store pixels + "add v5.4s, v5.4s, v6.4s \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v16", "v17", "v18", "v19" + ); +} + +#undef LOAD2_DATA32_LANE + 
+// Read 16x2 average down and write 8x1. +void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "1: \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Read 8x2 upsample with filtering and write 16x1. +// Actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" + + "1: \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : "r"(2LL), // %4 + "r"(14LL) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19" // Clobber List + ); +} + +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
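+      // v0 holds the even-indexed UV pairs, v1 the odd ones (ld2 on .8h
+      // lanes keeps each 2-byte UV pair intact).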
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. + "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 + "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "prfm pldl1keep, [%1, 448] \n" + "rshrn v1.8b, v1.8h, #2 \n" + "st2 {v0.8b,v1.8b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v16", "v17"); +} + +// Reads 4 pixels at a time. +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.h}[0], [%0], %6 \n" + "ld1 {v1.h}[0], [%1], %6 \n" + "ld1 {v2.h}[0], [%2], %6 \n" + "ld1 {v3.h}[0], [%3], %6 \n" + "subs %w5, %w5, #4 \n" // 4 pixels per loop. + "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"((int64_t)(src_stepx * 8)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_rgb.cc b/source/scale_rgb.cc new file mode 100644 index 00000000..8db59b56 --- /dev/null +++ b/source/scale_rgb.cc @@ -0,0 +1,66 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" /* For FilterMode */ + +#include +#include + +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from_argb.h" +#include "libyuv/row.h" +#include "libyuv/scale_argb.h" +#include "libyuv/scale_rgb.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Scale a 24 bit image. 
+// Converts to ARGB as intermediate step
+
+LIBYUV_API
+int RGBScale(const uint8_t* src_rgb,
+             int src_stride_rgb,
+             int src_width,
+             int src_height,
+             uint8_t* dst_rgb,
+             int dst_stride_rgb,
+             int dst_width,
+             int dst_height,
+             enum FilterMode filtering) {
+  int r;
+  uint8_t* src_argb =
+      (uint8_t*)malloc(src_width * src_height * 4 + dst_width * dst_height * 4);
+  uint8_t* dst_argb = src_argb + src_width * src_height * 4;
+
+  if (!src_argb) {
+    return 1;
+  }
+
+  r = RGB24ToARGB(src_rgb, src_stride_rgb, src_argb, src_width * 4, src_width,
+                  src_height);
+  if (!r) {
+    r = ARGBScale(src_argb, src_width * 4, src_width, src_height, dst_argb,
+                  dst_width * 4, dst_width, dst_height, filtering);
+    if (!r) {
+      r = ARGBToRGB24(dst_argb, dst_width * 4, dst_rgb, dst_stride_rgb,
+                      dst_width, dst_height);
+    }
+  }
+  free(src_argb);
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
new file mode 100644
index 00000000..1556071d
--- /dev/null
+++ b/source/scale_uv.cc
@@ -0,0 +1,1171 @@
+/*
+ *  Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV to 1/2 of
+// its original size.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+                         int src_height,
+                         int dst_width,
+                         int dst_height,
+                         int src_stride,
+                         int dst_stride,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_uv,
+                         int x,
+                         int dx,
+                         int y,
+                         int dy,
+                         enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+                          uint8_t* dst_uv, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleUVRowDown2_C
+          : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+                                        : ScaleUVRowDown2Box_C);
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 2);      // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
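+  // (x >> 16) is in UV pixel units, 2 bytes each; the non-bilinear paths
+  // bias the source one UV pixel left to compensate for row kernels that
+  // favour the second pixel of each 2x1 pair.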
+ if (filtering == kFilterBilinear) { + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; + } else { + src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2; + } + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) + if (TestCpuFlag(kCpuHasNEON) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif + +// This code is not enabled. Only box filter is available at this time. +#if defined(HAS_SCALEUVROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3 + : ScaleUVRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3 + : ScaleUVRowDown2Box_SSSE3); + } + } +#endif + +#if defined(HAS_SCALEUVROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA + : ScaleUVRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA + : ScaleUVRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif // HAS_SCALEUVDOWN2 + +// ScaleUV, 1/4 +// This is an optimized version for scaling down a UV to 1/4 of +// its original size. +#if HAS_SCALEUVDOWN4BOX +static void ScaleUVDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + // Allocate 2 rows of UV. + const int row_size = (dst_width * 2 * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); + int row_stride = src_stride * (dy >> 16); + void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, + uint8_t* dst_uv, int dst_width) = + ScaleUVRowDown2Box_C; + // Advance to odd row, even column. + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. 
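+  // The 1/4 box is computed as two 1/2 box passes: the loop below folds
+  // source rows 0..1 and 2..3 into the temporary 'row' buffer at double
+  // dst_width, then runs the same 2x kernel across those two rows.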
+ +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); + ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + row_size, + dst_width * 2); + ScaleUVRowDown2(row, row_size, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } + free_aligned_buffer_64(row); +} +#endif // HAS_SCALEUVDOWN4BOX + +// ScaleUV Even +// This is an optimized version for scaling down a UV to even +// multiple of its original size. +#if HAS_SCALEUVDOWNEVEN +static void ScaleUVDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride); + void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, + int src_step, uint8_t* dst_uv, int dst_width) = + filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; + (void)src_width; + (void)src_height; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; +#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 + : ScaleUVRowDownEven_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && !filtering) { + ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = ScaleUVRowDownEven_NEON; + } + } +#endif // TODO(fbarchard): Enable Box filter +#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON + : ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif + +// Scale UV down with bilinear interpolation. 
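+// Each destination row blends two source rows with InterpolateRow into a
+// temporary row, then samples columns from it with ScaleUVFilterCols.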
+#if HAS_SCALEUVBILINEARDOWN +static void ScaleUVBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. + src_uv += xl * 2; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of UV. 
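+  // (For kFilterLinear the loop below skips the vertical blend and filters
+  // columns directly from the source row.)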
+ { + align_buffer_64(row, clip_src_width * 2); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; + if (filtering == kFilterLinear) { + ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleUVFilterCols(dst_uv, row, dst_width, x, dx); + } + dst_uv += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} +#endif + +// Scale UV up with bilinear interpolation. +#if HAS_SCALEUVBILINEARUP +static void ScaleUVBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + filtering ? ScaleUVFilterCols_C : ScaleUVCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + if (src_width >= 32768) { + ScaleUVFilterCols = filtering ? 
ScaleUVFilterCols64_C : ScaleUVCols64_C; + } +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_SSSE3) + if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + ScaleUVFilterCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleUVFilterCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; + + // Allocate 2 rows of UV. + const int row_size = (dst_width * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); + + uint8_t* rowptr = row; + int rowstride = row_size; + int lasty = yi; + + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); + if (src_height > 2) { + src += src_stride; + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_uv + yi * (intptr_t)src_stride; + } + if (yi != lasty) { + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + if ((y + 65536) < max_y) { + src += src_stride; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf); + } + dst_uv += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} +#endif // HAS_SCALEUVBILINEARUP + +// Scale UV, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of NV16 to NV24. +void ScaleUVLinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv) { + void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. 
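+  // e.g. an odd dst_width of 7 maps back to src_width (7 + 1) / 2 = 4.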
+ assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); + dst_uv += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of NV12 to NV24. +void ScaleUVBilinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale 16 bit UV, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of P210 to P410. +void ScaleUVLinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_uv, + uint16_t* dst_uv) { + void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. 
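+  // The vertical position below is 16.16 fixed point: y starts at
+  // (1 << 15) - 1, roughly half a source row, so (y >> 16) picks the
+  // nearest row, and dy steps by (src_height - 1) / (dst_height - 1)
+  // source rows per destination row.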
+ assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); + dst_uv += dst_stride; + y += dy; + } + } +} + +// Scale 16 bit UV, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of P010 to P410. +void ScaleUVBilinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale UV to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScaleUVSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + void (*ScaleUVCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, + int x, int dx) = + (src_width >= 32768) ? 
ScaleUVCols64_C : ScaleUVCols_C; + (void)src_height; +#if defined(HAS_SCALEUVCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVCols = ScaleUVCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVCols = ScaleUVCols_MSA; + } + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleUVCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVColsUp2_SSSE3; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x, + dx); + dst_uv += dst_stride; + y += dy; + } +} + +// Copy UV with optional flipping +#if HAS_UVCOPY +static int UVCopy(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; + src_stride_uv = -src_stride_uv; + } + + CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height); + return 0; +} + +static int UVCopy_16(const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; + src_stride_uv = -src_stride_uv; + } + + CopyPlane_16(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height); + return 0; +} +#endif // HAS_UVCOPY + +// Scale a UV plane (from NV12) +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. +static void ScaleUV(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // UV does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (intptr_t)src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64_t clipf = (int64_t)(clip_x)*dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 2; + dst += clip_x * 2; + } + if (clip_y) { + int64_t clipf = (int64_t)(clip_y)*dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * (intptr_t)src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. 
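+      // A zero 16.16 step means every output pixel samples the same source
+      // position, so no filter kernel can change the result.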
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. i.e. 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+                       src_stride, dst_stride, src, dst, x, dx, y, dy,
+                       filtering);
+          return;
+        }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy);
+          return;
+        }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+        ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
+        return;
+#endif
+      }
+      // Optimized odd scale down. i.e. 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+#ifdef HAS_UVCOPY
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2,
+                 src_stride, dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+#endif
+      }
+    }
+  }
+  // HAS_SCALEPLANEVERTICAL
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering);
+    return;
+  }
+  if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) {
+    ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
+                     dst_stride, src, dst);
+    return;
+  }
+  if ((clip_height + 1) / 2 == src_height &&
+      (clip_width + 1) / 2 == src_width &&
+      (filtering == kFilterBilinear || filtering == kFilterBox)) {
+    ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
+                       src_stride, dst_stride, src, dst);
+    return;
+  }
+#if HAS_SCALEUVBILINEARUP
+  if (filtering && dy < 65536) {
+    ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+                      src_stride, dst_stride, src, dst, x, dx, y, dy,
+                      filtering);
+    return;
+  }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+  if (filtering) {
+    ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
+    return;
+  }
+#endif
+  ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                dst_stride, src, dst, x, dx, y, dy);
+}
+
+// Scale a UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+            int src_stride_uv,
+            int src_width,
+            int src_height,
+            uint8_t* dst_uv,
+            int dst_stride_uv,
+            int dst_width,
+            int dst_height,
+            enum FilterMode filtering) {
+  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+          dst_width, dst_height, 0, 0, dst_width, dst_height, filtering);
+  return 0;
+}
+
+// Scale a 16 bit UV image.
+// This function is currently incomplete; it can't handle all cases.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+               int src_stride_uv,
+               int src_width,
+               int src_height,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int dst_width,
+               int dst_height,
+               enum FilterMode filtering) {
+  int dy = 0;
+
+  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  // UV does not support box filter yet, but allow the user to pass it.
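+  // (ScaleFilterReduce below substitutes a mode this path can handle.)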
+ // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv; + src_stride_uv = -src_stride_uv; + } + src_width = Abs(src_width); + +#ifdef HAS_UVCOPY + if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { + if (dst_height == 1) { + UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv, + src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); + } else { + dy = src_height / dst_height; + UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv, + (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv, + dst_width, dst_height); + } + + return 0; + } +#endif + + if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) { + ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, + src_stride_uv, dst_stride_uv, src_uv, dst_uv); + return 0; + } + + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height, + src_stride_uv, dst_stride_uv, src_uv, dst_uv); + return 0; + } + + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_win.cc b/source/scale_win.cc new file mode 100644 index 00000000..ea1f95c6 --- /dev/null +++ b/source/scale_win.cc @@ -0,0 +1,1392 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Coefficients for source bytes 21 to 31 +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; + +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// Reads 32 pixels, throws half away and writes 16 pixels. +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. 
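+// Scalar sketch of the 2x2 box average computed per output byte (r0/r1 are
+// the two source rows; illustrative only):
+//   dst[i] = (r0[2 * i] + r0[2 * i + 1] + r1[2 * i] + r1[2 * i + 1] + 2) >> 2
+// pmaddubsw with 0x0101 does the horizontal adds, paddw the vertical add,
+// and psrlw + pavgw produce the rounded divide by 4.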
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add + paddw xmm1, xmm3 + psrlw xmm0, 1 + psrlw xmm1, 1 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +// Reads 64 pixels, throws half away and writes 32 pixels. +__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x1 rectangle to 32x1. +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// For rounding, average = (sum + 2) / 4 +// becomes average((sum >> 1), 0) +// Blends 64x2 rectangle to 32x1. 
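+// e.g. sum = 7: (7 + 2) / 4 = 2, and avg(7 >> 1, 0) = (3 + 0 + 1) >> 1 = 2.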
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] + lea eax, [eax + 64] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm1, ymm1, 1 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +// Point samples 32 pixels to 8 pixels. +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + movdqa xmm5, xmm4 + packuswb xmm4, xmm4 + psllw xmm5, 3 // constant 0x0008 + + wloop: + movdqu xmm0, [eax] // average rows + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm1, xmm3 + movdqu xmm2, [eax + esi * 2] + movdqu xmm3, [eax + esi * 2 + 16] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 2 + paddw xmm1, xmm3 + movdqu xmm2, [eax + edi] + movdqu xmm3, [eax + edi + 16] + lea eax, [eax + 32] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 3 + paddw xmm1, xmm3 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +// Point samples 64 pixels to 16 pixels. 
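+// The 0x00ff0000 mask keeps byte 2 of every 4 byte group, so source pixels
+// 2, 6, 10, ... are the ones that survive the pack sequence.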
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t* dst_ptr,
+                                          int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff0000
+    vpsrld     ymm5, ymm5, 24
+    vpslld     ymm5, ymm5, 16
+
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vpand      ymm0, ymm0, ymm5
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
+    vpsrlw     ymm0, ymm0, 8
+    vpackuswb  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
+    vmovdqu    [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8_t* dst_ptr,
+                                             int dst_width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]  // src_ptr
+    mov        esi, [esp + 8 + 8]  // src_stride
+    mov        edx, [esp + 8 + 12]  // dst_ptr
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    lea        edi, [esi + esi * 2]  // src_stride * 3
+    vpcmpeqb   ymm4, ymm4, ymm4  // constant 0x0101
+    vpsrlw     ymm4, ymm4, 15
+    vpsllw     ymm5, ymm4, 3  // constant 0x0008
+    vpackuswb  ymm4, ymm4, ymm4
+
+  wloop:
+    vmovdqu    ymm0, [eax]  // average rows
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + esi]
+    vmovdqu    ymm3, [eax + esi + 32]
+    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw     ymm0, ymm0, ymm2  // vertical add rows 0, 1
+    vpaddw     ymm1, ymm1, ymm3
+    vmovdqu    ymm2, [eax + esi * 2]
+    vmovdqu    ymm3, [eax + esi * 2 + 32]
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw     ymm0, ymm0, ymm2  // add row 2
+    vpaddw     ymm1, ymm1, ymm3
+    vmovdqu    ymm2, [eax + edi]
+    vmovdqu    ymm3, [eax + edi + 32]
+    lea        eax, [eax + 64]
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw     ymm0, ymm0, ymm2  // add row 3
+    vpaddw     ymm1, ymm1, ymm3
+    vphaddw    ymm0, ymm0, ymm1  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // unmutate vphaddw
+    vpaddw     ymm0, ymm0, ymm5  // + 8 for round
+    vpsrlw     ymm0, ymm0, 4  // /16 for average of 4 * 4
+    vpackuswb  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
+    vmovdqu    [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8_t* dst_ptr,
+                                            int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    movdqa     xmm3, xmmword ptr kShuf0
+    movdqa     xmm4, xmmword ptr kShuf1
+    movdqa     xmm5, xmmword ptr kShuf2
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqa     xmm2, xmm1
+    palignr    xmm1, xmm0, 8
+    pshufb     xmm0, xmm3
+    pshufb     xmm1, xmm4
+    pshufb     xmm2, xmm5
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + 8], xmm1
+    movq       qword ptr [edx + 16], xmm2
+    lea        edx, [edx + 24]
+    sub        ecx, 24
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 24x1.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
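+// Columns are weighted 3:1, 2:2 and 1:3 by kMadd01/kMadd11/kMadd21, biased
+// by kRound34 (2) and divided by 4 (psrlw 2) to form each 3/4 box output.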
+ +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, xmmword ptr kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, xmmword ptr kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, xmmword ptr kShuf38a + movdqa xmm5, xmmword ptr kShuf38b + + xloop: + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + 
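+    // kShuf38a gathered pixels 0..5 into the low bytes and kShuf38b pixels
+    // 6..11 into the high bytes; the zeroed lanes make the add a merge.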
paddusb xmm0, xmm1 + + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + sub ecx, 12 + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShufAc + movdqa xmm3, xmmword ptr kShufAc3 + movdqa xmm4, xmmword ptr kScaleAc33 + pxor xmm5, xmm5 + + xloop: + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqu xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShufAb0 + movdqa xmm3, xmmword ptr kShufAb1 + movdqa xmm4, xmmword ptr kShufAb2 + movdqa xmm5, xmmword ptr kScaleAb2 + + xloop: + movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm1, [eax + esi] + lea eax, [eax + 16] + pavgb xmm0, xmm1 + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Reads 16 bytes and accumulates to 16 shorts at a time. +__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + pxor xmm5, xmm5 + + // sum rows + xloop: + movdqu xmm3, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm1, [edx + 16] + movdqa xmm2, xmm3 + punpcklbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 16 + jg xloop + ret + } +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. 
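+// Scalar equivalent (sketch): for each byte i, dst_ptr[i] += src_ptr[i],
+// accumulated as uint16_t with unsigned saturation (vpaddusw).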
+__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + vpxor ymm5, ymm5, ymm5 + + // sum rows + xloop: + vmovdqu ymm3, [eax] // read 32 bytes + lea eax, [eax + 32] + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck + vpunpcklbw ymm2, ymm3, ymm5 + vpunpckhbw ymm3, ymm3, ymm5 + vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm1, ymm3, [edx + 32] + vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 32 + jg xloop + + vzeroupper + ret + } +} +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. +__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pcmpeqb xmm7, xmm7 // generate 0x0001 + psrlw xmm7, 15 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. + pxor xmm1, xmm6 // 0..7f and 7f..0 + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. + movd ebx, xmm1 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. + pxor xmm2, xmm6 // 0..7f and 7f..0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit + paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits + movd ebx, xmm2 + mov [edi], bl + + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. 
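+// punpcklbw/punpckhbw of a register with itself duplicates each byte:
+// a b c d ... becomes a a b b c c d d ...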
+__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. 
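[Editor's sketch] ScaleARGBRowDown2Box_SSE2 above, and ScaleARGBRowDownEvenBox_SSE2 below, both reduce a 2x2 block with two rounds of pavgb: rows first, then the even/odd column pair. Since pavgb computes (a + b + 1) >> 1, the result can differ by 1 from a true (a + b + c + d + 2) >> 2 box average. A scalar model of the double-rounded form the SIMD code actually computes; names are illustrative:

#include <stddef.h>
#include <stdint.h>

static uint8_t AvgRound(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // pavgb: average, rounding up
}

static void ScaleARGBRowDown2Box_Reference(const uint8_t* src_argb,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_argb,
                                           int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A channels
      // Average the two rows first, then the two columns,
      // matching the pavgb order in the assembly.
      uint8_t left = AvgRound(src_argb[c], src_argb[src_stride + c]);
      uint8_t right = AvgRound(src_argb[c + 4], src_argb[src_stride + c + 4]);
      dst_argb[c] = AvgRound(left, right);
    }
    src_argb += 8;  // two source pixels consumed
    dst_argb += 4;  // one destination pixel produced
  }
}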
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. +__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 // 4 pixels + jge xloop4 + + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
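[Editor's sketch] Unlike the bilinear ARGB filter that follows, ScaleARGBCols_SSE2 above does no blending at all: it gathers whole 4-byte pixels at the integer part of the same 16.16 x/dx walk, using pextrw to pull source indices out of the vector two at a time. A scalar equivalent, with an illustrative name:

#include <stdint.h>

static void ScaleARGBCols_Reference(uint8_t* dst_argb,
                                    const uint8_t* src_argb,
                                    int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;  // one ARGB pixel per u32
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // nearest source pixel, no filtering
    x += dx;
  }
}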
+// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, xmmword ptr kShuffleColARGB + movdqa xmm5, xmmword ptr kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. 
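[Editor's sketch] FixedDiv_X86 above forms the 48-bit value num << 16 in the edx:eax register pair (shld/shl) and divides it with idiv; FixedDiv1_X86 below additionally subtracts 0x00010001 from the shifted numerator and 1 from the divisor, the "minus one" form the scalers use so the last destination pixel can land exactly on the last source pixel. In portable C the pair reduces to the following; 64-bit arithmetic stands in for the edx:eax pair, and the names are illustrative:

#include <stdint.h>

static int FixedDiv_Reference(int num, int div) {
  return (int)(((int64_t)num << 16) / div);  // 16.16 fixed-point quotient
}

static int FixedDiv1_Reference(int num, int div) {
  // Matches the asm: (num << 16) - 0x00010001, divided by (div - 1).
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}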
+__declspec(naked) int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/test.sh b/source/test.sh new file mode 100755 index 00000000..7f12c3c1 --- /dev/null +++ b/source/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -x + +function runbenchmark1 { + perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 + perf report | grep AVX +} + +runbenchmark1 ABGRToI420 +runbenchmark1 Android420ToI420 +runbenchmark1 ARGBToI420 +runbenchmark1 Convert16To8Plane +runbenchmark1 ConvertToARGB +runbenchmark1 ConvertToI420 +runbenchmark1 CopyPlane +runbenchmark1 H010ToAB30 +runbenchmark1 H010ToAR30 +runbenchmark1 HalfFloatPlane +runbenchmark1 I010ToAB30 +runbenchmark1 I010ToAR30 +runbenchmark1 I420Copy +runbenchmark1 I420Psnr +runbenchmark1 I420Scale +runbenchmark1 I420Ssim +runbenchmark1 I420ToARGB +runbenchmark1 I420ToNV12 +runbenchmark1 I420ToUYVY +runbenchmark1 I422ToI420 +runbenchmark1 InitCpuFlags +runbenchmark1 J420ToARGB +runbenchmark1 NV12ToARGB +runbenchmark1 NV12ToI420 +runbenchmark1 NV12ToI420Rotate +runbenchmark1 SetCpuFlags +runbenchmark1 YUY2ToI420 diff --git a/source/video_common.cc b/source/video_common.cc new file mode 100644 index 00000000..92384c05 --- /dev/null +++ b/source/video_common.cc @@ -0,0 +1,62 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +struct FourCCAliasEntry { + uint32_t alias; + uint32_t canonical; +}; + +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. 
+// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32_t CanonicalFourCC(uint32_t fourcc) { + int i; + for (i = 0; i < NUM_ALIASES; ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif -- cgit v1.2.3
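[Editor's note] CanonicalFourCC is a plain linear scan over the 18-entry alias table; at this size a lookup table or hash would gain nothing. A minimal usage sketch, assuming the libyuv headers are on the include path:

#include <stdint.h>
#include "libyuv/video_common.h"

int main() {
  // YUYV appears in the alias table, so it canonicalizes to YUY2;
  // a code with no table entry is returned unchanged.
  uint32_t c = libyuv::CanonicalFourCC(libyuv::FOURCC_YUYV);
  return c == libyuv::FOURCC_YUY2 ? 0 : 1;
}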