diff options
33 files changed, 9572 insertions, 31 deletions
@@ -69,6 +69,10 @@ group("libyuv") { deps += [ ":libyuv_msa" ] } + if (libyuv_use_mmi) { + deps += [ ":libyuv_mmi" ] + } + if (!is_ios) { # Make sure that clients of libyuv link with libjpeg. This can't go in # libyuv_internal because in Windows x64 builds that will generate a clang @@ -229,6 +233,24 @@ if (libyuv_use_msa) { } } +if (libyuv_use_mmi) { + static_library("libyuv_mmi") { + sources = [ + # MMI Source Files + "source/compare_mmi.cc", + "source/rotate_mmi.cc", + "source/row_mmi.cc", + "source/scale_mmi.cc", + ] + + deps = [ + ":libyuv_internal", + ] + + public_configs = [ ":libyuv_config" ] + } +} + if (libyuv_include_tests) { config("libyuv_unittest_warnings_config") { if (!is_win) { diff --git a/docs/getting_started.md b/docs/getting_started.md index f547c419..bb3226e5 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -178,6 +178,15 @@ Running test with C code: ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest +### MIPS Linux + +mips + + gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false is_clang=false use_sysroot=false use_gold=false" + gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false is_clang=false use_sysroot=false use_gold=false" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest + ## Building the Library with make ### Linux diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index e81f7455..e95b9d93 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -84,6 +84,11 @@ extern "C" { #define HAS_SUMSQUAREERROR_MSA #endif +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +#define HAS_HAMMINGDISTANCE_MMI +#define HAS_SUMSQUAREERROR_MMI +#endif + uint32_t HammingDistance_C(const uint8_t* src_a, 
const uint8_t* src_b, int count); @@ -102,7 +107,9 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, uint32_t HammingDistance_MSA(const uint8_t* src_a, const uint8_t* src_b, int count); - +uint32_t HammingDistance_MMI(const uint8_t* src_a, + const uint8_t* src_b, + int count); uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count); @@ -118,6 +125,9 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, uint32_t SumSquareError_MSA(const uint8_t* src_a, const uint8_t* src_b, int count); +uint32_t SumSquareError_MMI(const uint8_t* src_a, + const uint8_t* src_b, + int count); uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index 0229cb5e..b01cd25c 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -48,6 +48,7 @@ static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; // These flags are only valid on MIPS processors. static const int kCpuHasMIPS = 0x200000; static const int kCpuHasMSA = 0x400000; +static const int kCpuHasMMI = 0x800000; // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. 
diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index 5edc0fcf..022293ee 100644..100755 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -60,6 +60,11 @@ extern "C" { #define HAS_TRANSPOSEUVWX16_MSA #endif +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +#define HAS_TRANSPOSEWX8_MMI +#define HAS_TRANSPOSEUVWX8_MMI +#endif + void TransposeWxH_C(const uint8_t* src, int src_stride, uint8_t* dst, @@ -87,6 +92,11 @@ void TransposeWx8_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width); +void TransposeWx8_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); void TransposeWx8_Fast_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, @@ -108,6 +118,11 @@ void TransposeWx8_Any_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width); +void TransposeWx8_Any_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, @@ -156,6 +171,13 @@ void TransposeUVWx8_NEON(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); void TransposeUVWx16_MSA(const uint8_t* src, int src_stride, uint8_t* dst_a, @@ -178,6 +200,13 @@ void TransposeUVWx8_Any_NEON(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_Any_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); void TransposeUVWx16_Any_MSA(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 1468f4b9..06ab3756 100644..100755 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -483,6 +483,81 @@ extern "C" { #define HAS_YUY2TOYROW_MSA #endif +#if 
!defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +#define HAS_ABGRTOUVROW_MMI +#define HAS_ABGRTOYROW_MMI +#define HAS_ARGB1555TOARGBROW_MMI +#define HAS_ARGB1555TOUVROW_MMI +#define HAS_ARGB1555TOYROW_MMI +#define HAS_ARGB4444TOARGBROW_MMI +#define HAS_ARGB4444TOUVROW_MMI +#define HAS_ARGB4444TOYROW_MMI +#define HAS_ARGBADDROW_MMI +#define HAS_ARGBATTENUATEROW_MMI +#define HAS_ARGBBLENDROW_MMI +#define HAS_ARGBCOLORMATRIXROW_MMI +#define HAS_ARGBCOPYALPHAROW_MMI +#define HAS_ARGBCOPYYTOALPHAROW_MMI +#define HAS_ARGBEXTRACTALPHAROW_MMI +#define HAS_ARGBGRAYROW_MMI +#define HAS_ARGBMIRRORROW_MMI +#define HAS_ARGBMULTIPLYROW_MMI +#define HAS_ARGBSEPIAROW_MMI +#define HAS_ARGBSHADEROW_MMI +#define HAS_ARGBSHUFFLEROW_MMI +#define HAS_ARGBSUBTRACTROW_MMI +#define HAS_ARGBTOARGB1555ROW_MMI +#define HAS_ARGBTOARGB4444ROW_MMI +#define HAS_ARGBTORAWROW_MMI +#define HAS_ARGBTORGB24ROW_MMI +#define HAS_ARGBTORGB565DITHERROW_MMI +#define HAS_ARGBTORGB565ROW_MMI +#define HAS_ARGBTOUV444ROW_MMI +#define HAS_ARGBTOUVJROW_MMI +#define HAS_ARGBTOUVROW_MMI +#define HAS_ARGBTOYJROW_MMI +#define HAS_ARGBTOYROW_MMI +#define HAS_BGRATOUVROW_MMI +#define HAS_BGRATOYROW_MMI +#define HAS_BLENDPLANEROW_MMI +#define HAS_COMPUTECUMULATIVESUMROW_MMI +#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI +#define HAS_HALFFLOATROW_MMI +#define HAS_I400TOARGBROW_MMI +#define HAS_I422TOUYVYROW_MMI +#define HAS_I422TOYUY2ROW_MMI +#define HAS_INTERPOLATEROW_MMI +#define HAS_J400TOARGBROW_MMI +#define HAS_MERGERGBROW_MMI +#define HAS_MERGEUVROW_MMI +#define HAS_MIRRORROW_MMI +#define HAS_MIRRORUVROW_MMI +#define HAS_RAWTOARGBROW_MMI +#define HAS_RAWTORGB24ROW_MMI +#define HAS_RAWTOUVROW_MMI +#define HAS_RAWTOYROW_MMI +#define HAS_RGB24TOARGBROW_MMI +#define HAS_RGB24TOUVROW_MMI +#define HAS_RGB24TOYROW_MMI +#define HAS_RGB565TOARGBROW_MMI +#define HAS_RGB565TOUVROW_MMI +#define HAS_RGB565TOYROW_MMI +#define HAS_RGBATOUVROW_MMI +#define HAS_RGBATOYROW_MMI +#define HAS_SOBELROW_MMI +#define 
HAS_SOBELTOPLANEROW_MMI +#define HAS_SOBELXROW_MMI +#define HAS_SOBELXYROW_MMI +#define HAS_SOBELYROW_MMI +#define HAS_SPLITRGBROW_MMI +#define HAS_SPLITUVROW_MMI +#define HAS_UYVYTOUVROW_MMI +#define HAS_UYVYTOYROW_MMI +#define HAS_YUY2TOUV422ROW_MMI +#define HAS_YUY2TOUVROW_MMI +#define HAS_YUY2TOYROW_MMI +#endif + #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #if defined(VISUALC_HAS_AVX2) #define SIMD_ALIGNED(var) __declspec(align(32)) var @@ -837,6 +912,8 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -855,6 +932,15 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_MMI(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_MMI(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -940,6 +1026,51 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* 
dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -959,6 +1090,17 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width); + void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void 
BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); @@ -1001,6 +1143,20 @@ void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToUVRow_AVX2(const uint8_t* src_argb0, int src_stride_argb, @@ -1090,6 +1246,15 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -1175,6 +1340,51 @@ void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + 
uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, uint8_t* dst_u, @@ -1254,12 +1464,14 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_SSSE3(const uint8_t* src, uint8_t* dst_u, @@ -1273,6 +1485,10 @@ void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void MirrorUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + 
int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -1282,6 +1498,7 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -1293,6 +1510,7 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, @@ -1314,6 +1532,10 @@ void SplitUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1330,6 +1552,10 @@ void SplitUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, @@ -1351,6 +1577,10 @@ void MergeUVRow_MSA(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_MMI(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -1367,6 +1597,10 @@ void MergeUVRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void MergeUVRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + 
int width); void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, @@ -1383,6 +1617,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitRGBRow_MMI(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -1393,6 +1632,11 @@ void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitRGBRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void MergeRGBRow_C(const uint8_t* src_r, const uint8_t* src_g, @@ -1409,6 +1653,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); +void MergeRGBRow_MMI(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -1419,6 +1668,11 @@ void MergeRGBRow_Any_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); +void MergeRGBRow_Any_MMI(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, @@ -1497,12 +1751,16 @@ void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void 
ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, @@ -1517,6 +1775,9 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, uint8_t* dst_a, int width); +void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, + uint8_t* dst_a, + int width); void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1529,16 +1790,23 @@ void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr, void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void SetRow_C(uint8_t* dst, uint8_t v8, int width); void SetRow_MSA(uint8_t* dst, uint8_t v8, int width); @@ -1576,6 +1844,10 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); +void ARGBShuffleRow_MMI(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -1592,6 +1864,10 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); +void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); void 
RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, @@ -1615,28 +1891,40 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); +void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); +void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); @@ -1687,24 +1975,35 @@ void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void 
RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1712,6 +2011,9 @@ void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); @@ -1780,6 +2082,20 @@ void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, const uint32_t dither4, int width); +void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + int 
width); +void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); + void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -1793,6 +2109,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -1804,6 +2121,7 @@ void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void I444ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, @@ -2324,6 +2642,7 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width); void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width); void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2334,6 +2653,7 @@ void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); 
+void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); // ARGB preattenuated alpha blend. void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, @@ -2348,6 +2668,10 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBBlendRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBBlendRow_C(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -2374,6 +2698,16 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void BlendPlaneRow_MMI(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); +void BlendPlaneRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void BlendPlaneRow_C(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, @@ -2418,6 +2752,14 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB add images. void ARGBAddRow_C(const uint8_t* src_argb0, @@ -2456,6 +2798,14 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBAddRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB subtract images. Same API as Blend, but these require // pointer and width alignment for SSE2. 
@@ -2495,6 +2845,14 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBSubtractRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2584,6 +2942,24 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, const uint32_t param, int width); +void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); + void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2770,15 +3146,25 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width); void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_C(const uint8_t* src_yuy2, int src_stride_yuy2, 
@@ -2820,15 +3206,25 @@ void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, @@ -2870,15 +3266,25 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width); void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_MSA(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_MMI(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_C(const uint8_t* src_uyvy, @@ -2921,15 +3327,25 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); 
+void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void I422ToYUY2Row_C(const uint8_t* src_y, const uint8_t* src_u, @@ -3006,21 +3422,41 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width); +void I422ToYUY2Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width); +void I422ToUYVYRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); // Effects related row functions. 
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); @@ -3036,6 +3472,9 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_MMI(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3048,6 +3487,9 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); // Inverse table for unattenuate, shared by C and SSE2. extern const uint32_t fixed_invtbl8[256]; @@ -3071,11 +3513,13 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); +void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width); void ARGBColorMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_argb, @@ -3093,6 +3537,10 @@ void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); +void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); void ARGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, @@ -3145,6 +3593,10 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); +void ARGBShadeRow_MMI(const uint8_t* src_argb, + 
uint8_t* dst_argb, + int width, + uint32_t value); // Used for blur. void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, @@ -3158,6 +3610,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, const int32_t* previous_cumsum, int width); +void ComputeCumulativeSumRow_MMI(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); + void CumulativeSumToAverageRow_C(const int32_t* tl, const int32_t* bl, int w, @@ -3208,6 +3665,11 @@ void InterpolateRow_MSA(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void InterpolateRow_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, @@ -3228,6 +3690,11 @@ void InterpolateRow_Any_MSA(uint8_t* dst_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); +void InterpolateRow_Any_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); void InterpolateRow_16_C(uint16_t* dst_ptr, const uint16_t* src_ptr, @@ -3256,6 +3723,11 @@ void SobelXRow_MSA(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width); +void SobelXRow_MMI(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); void SobelYRow_C(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, @@ -3272,6 +3744,10 @@ void SobelYRow_MSA(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width); +void SobelYRow_MMI(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); void SobelRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, @@ -3288,6 +3764,10 @@ void SobelRow_MSA(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); +void SobelRow_MMI(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + 
uint8_t* dst_argb, + int width); void SobelToPlaneRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, @@ -3304,6 +3784,10 @@ void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width); +void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); void SobelXYRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, @@ -3320,6 +3804,10 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); +void SobelXYRow_MMI(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); void SobelRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3332,6 +3820,10 @@ void SobelRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void SobelRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3344,6 +3836,10 @@ void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void SobelXYRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3356,6 +3852,10 @@ void SobelXYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void SobelXYRow_Any_MMI(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBPolynomialRow_C(const uint8_t* src_argb, uint8_t* dst_argb, diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 7194ba09..3042136d 100644..100755 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -110,6 +110,24 @@ extern "C" { #define HAS_SCALEROWDOWN4_MSA #endif +#if 
!defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +#define HAS_FIXEDDIV1_MIPS +#define HAS_FIXEDDIV_MIPS +#define HAS_SCALEARGBCOLS_MMI +#define HAS_SCALEARGBCOLSUP2_MMI +#define HAS_SCALEARGBFILTERCOLS_MMI +#define HAS_SCALEARGBROWDOWN2_MMI +#define HAS_SCALEARGBROWDOWNEVEN_MMI +#define HAS_SCALEROWDOWN2_MMI +#define HAS_SCALEROWDOWN4_MMI +#define HAS_SCALEADDROW_MMI +#define HAS_SCALEADDROW_16_MMI +#define HAS_SCALEROWDOWN2_16_MMI +#define HAS_SCALEROWDOWN4_16_MMI +#define HAS_SCALECOLS_MMI +#define HAS_SCALECOLS_16_MMI +#endif + // Scale ARGB vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, int dst_width, @@ -147,12 +165,17 @@ enum FilterMode ScaleFilterReduce(int src_width, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_C(int num, int div); int FixedDiv_X86(int num, int div); +int FixedDiv_MIPS(int num, int div); // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_C(int num, int div); int FixedDiv1_X86(int num, int div); +int FixedDiv1_MIPS(int num, int div); #ifdef HAS_FIXEDDIV_X86 #define FixedDiv FixedDiv_X86 #define FixedDiv1 FixedDiv1_X86 +#elif defined HAS_FIXEDDIV_MIPS +#define FixedDiv FixedDiv_MIPS +#define FixedDiv1 FixedDiv1_MIPS #else #define FixedDiv FixedDiv_C #define FixedDiv1 FixedDiv1_C @@ -569,6 +592,26 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, int dst_width, int x, int dx); +void ScaleARGBFilterCols_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); // ARGB Row functions void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, @@ -607,6 +650,18 @@ void 
ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); +void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -643,7 +698,18 @@ void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); - +void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -674,6 +740,16 @@ void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width); +void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -704,6 +780,16 @@ void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, int src_stepx, uint8_t* dst_ptr, int dst_width); +void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, + 
ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. @@ -936,6 +1022,93 @@ void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); +void ScaleRowDown2_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_16_MMI(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width); +void ScaleColsUp2_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); + 
+void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_Any_MMI(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv @@ -13,8 +13,11 @@ import("//build/config/mips.gni") declare_args() { libyuv_include_tests = !build_with_chromium libyuv_disable_jpeg = false - libyuv_use_neon = (current_cpu == "arm64" || - (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))) - libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") && - mips_use_msa + libyuv_use_neon = + current_cpu == "arm64" || + (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)) + libyuv_use_msa = + (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa + libyuv_use_mmi = + (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi } @@ -27,8 +27,10 @@ # Link-Time Optimizations. 'use_lto%': 0, 'mips_msa%': 0, # Default to msa off. + 'mips_mmi%': 0, # Default to mmi off. 
'build_neon': 0, 'build_msa': 0, + 'build_mmi': 0, 'conditions': [ ['(target_arch == "armv7" or target_arch == "armv7s" or \ (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\ @@ -40,6 +42,11 @@ { 'build_msa': 1, }], + ['(target_arch == "mipsel" or target_arch == "mips64el")\ + and (mips_mmi == 1)', + { + 'build_mmi': 1, + }], ], }, @@ -92,6 +99,11 @@ 'LIBYUV_MSA', ], }], + ['build_mmi != 0', { + 'defines': [ + 'LIBYUV_MMI', + ], + }], ['OS != "ios" and libyuv_disable_jpeg != 1', { 'defines': [ 'HAVE_JPEG' diff --git a/libyuv.gypi b/libyuv.gypi index 9467adfc..9424580a 100644 --- a/libyuv.gypi +++ b/libyuv.gypi @@ -36,6 +36,7 @@ 'source/compare_common.cc', 'source/compare_gcc.cc', 'source/compare_msa.cc', + 'source/compare_mmi.cc', 'source/compare_neon.cc', 'source/compare_neon64.cc', 'source/compare_win.cc', @@ -56,6 +57,7 @@ 'source/rotate_common.cc', 'source/rotate_gcc.cc', 'source/rotate_msa.cc', + 'source/rotate_mmi.cc', 'source/rotate_neon.cc', 'source/rotate_neon64.cc', 'source/rotate_win.cc', @@ -63,6 +65,7 @@ 'source/row_common.cc', 'source/row_gcc.cc', 'source/row_msa.cc', + 'source/row_mmi.cc', 'source/row_neon.cc', 'source/row_neon64.cc', 'source/row_win.cc', @@ -72,6 +75,7 @@ 'source/scale_common.cc', 'source/scale_gcc.cc', 'source/scale_msa.cc', + 'source/scale_mmi.cc', 'source/scale_neon.cc', 'source/scale_neon64.cc', 'source/scale_win.cc', diff --git a/libyuv_test.gyp b/libyuv_test.gyp index 5fe154c6..d517754c 100644 --- a/libyuv_test.gyp +++ b/libyuv_test.gyp @@ -95,6 +95,12 @@ 'LIBYUV_MSA' ], }], + [ '(target_arch == "mipsel" or target_arch == "mips64el") \ + and (mips_mmi == 1)', { + 'defines': [ + 'LIBYUV_MMI' + ], + }], ], # conditions 'defines': [ # Enable the following 3 macros to turn off assembly for specified CPU. 
diff --git a/source/compare.cc b/source/compare.cc index 50e3abd0..5aa3a4db 100644..100755 --- a/source/compare.cc +++ b/source/compare.cc @@ -154,6 +154,12 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a, HammingDistance = HammingDistance_MSA; } #endif +#if defined(HAS_HAMMINGDISTANCE_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + HammingDistance = HammingDistance_MMI; + } +#endif + #ifdef _OPENMP #pragma omp parallel for reduction(+ : diff) #endif @@ -210,6 +216,11 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, SumSquareError = SumSquareError_MSA; } #endif +#if defined(HAS_SUMSQUAREERROR_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SumSquareError = SumSquareError_MMI; + } +#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : sse) #endif diff --git a/source/compare_mmi.cc b/source/compare_mmi.cc new file mode 100644 index 00000000..7018b195 --- /dev/null +++ b/source/compare_mmi.cc @@ -0,0 +1,121 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +// Hakmem method for hamming distance. 
+uint32_t HammingDistance_MMI(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; + uint64_t c1 = 0x5555555555555555; + uint64_t c2 = 0x3333333333333333; + uint64_t c3 = 0x0f0f0f0f0f0f0f0f; + uint32_t c4 = 0x01010101; + uint64_t s1 = 1, s2 = 2, s3 = 4; + __asm__ volatile( + "1: \n\t" + "ldc1 %[ta], 0(%[src_a]) \n\t" + "ldc1 %[tb], 0(%[src_b]) \n\t" + "xor %[temp], %[ta], %[tb] \n\t" + "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 + "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 + "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 + "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) + "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 + "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 + "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t + "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 + "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) + "and %[temp1], %[temp1], %[c3] \n\t" //&c3 + "dmfc1 $t0, %[temp1] \n\t" + "dsrl32 $t0, $t0, 0 \n\t " + "mul $t0, $t0, %[c4] \n\t" + "dsrl $t0, $t0, 24 \n\t" + "dadd %[diff], %[diff], $t0 \n\t" + "dmfc1 $t0, %[temp1] \n\t" + "mul $t0, $t0, %[c4] \n\t" + "dsrl $t0, $t0, 24 \n\t" + "dadd %[diff], %[diff], $t0 \n\t" + "daddiu %[src_a], %[src_a], 8 \n\t" + "daddiu %[src_b], %[src_b], 8 \n\t" + "addiu %[count], %[count], -8 \n\t" + "bgtz %[count], 1b \n\t" + "nop \n\t" + : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), + [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), + [temp1] "+f"(temp1) + : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), + [s2] "f"(s2), [s3] "f"(s3) + : "memory"); + return diff; +} + +uint32_t SumSquareError_MMI(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + uint32_t sse_hi = 0u, sse_lo = 0u; + + uint64_t src1, src2; + uint64_t diff, diff_hi, diff_lo; + uint64_t sse_sum, sse_tmp; + + const uint64_t mask = 0x0ULL; + + __asm__ volatile( 
+ "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" + + "1: \n\t" + "ldc1 %[src1], 0x00(%[src_a]) \n\t" + "ldc1 %[src2], 0x00(%[src_b]) \n\t" + "pasubub %[diff], %[src1], %[src2] \n\t" + "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" + "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" + "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" + "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" + "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" + "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" + + "daddiu %[src_a], %[src_a], 0x08 \n\t" + "daddiu %[src_b], %[src_b], 0x08 \n\t" + "daddiu %[count], %[count], -0x08 \n\t" + "bnez %[count], 1b \n\t" + + "mfc1 %[sse_lo], %[sse_sum] \n\t" + "mfhc1 %[sse_hi], %[sse_sum] \n\t" + "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" + : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), + [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), + [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp), + [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) + : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), + [mask] "f"(mask) + : "memory"); + + return sse; +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/convert.cc b/source/convert.cc index 375cc732..fbac1371 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -502,6 +502,18 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + YUY2ToYRow = YUY2ToYRow_Any_MMI; + YUY2ToUVRow = YUY2ToUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_MMI; + } + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -583,6 +595,16 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + UYVYToYRow = UYVYToYRow_Any_MMI; + UYVYToUVRow = UYVYToUVRow_Any_MMI; + if 
(IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_MMI; + UYVYToUVRow = UYVYToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -679,6 +701,22 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -765,6 +803,22 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BGRAToYRow = BGRAToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_MMI; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BGRAToUVRow = BGRAToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); @@ -851,6 +905,22 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToYRow = ABGRToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_MMI; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToUVRow = ABGRToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -937,6 +1007,22 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGBAToYRow = RGBAToYRow_Any_MMI; + if 
(IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_MMI; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGBAToUVRow = RGBAToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -967,7 +1053,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, int width, int height) { int y; -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || defined(HAS_RGB24TOYROW_MMI)) void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVRow_C; @@ -1013,6 +1099,17 @@ int RGB24ToI420(const uint8_t* src_rgb24, RGB24ToUVRow = RGB24ToUVRow_MSA; } } +#elif defined(HAS_RGB24TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MMI; + RGB24ToYRow = RGB24ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from RGB24 to ARGB. #else #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -1046,14 +1143,14 @@ int RGB24ToI420(const uint8_t* src_rgb24, #endif { -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || defined(HAS_RGB24TOYROW_MMI)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || defined(HAS_RGB24TOYROW_MMI)) RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -1070,7 +1167,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || defined(HAS_RGB24TOYROW_MMI)) RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); #else @@ -1079,7 +1176,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || defined(HAS_RGB24TOYROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1099,7 +1196,7 @@ int RAWToI420(const uint8_t* src_raw, int width, int height) { int y; -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)) void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) = RAWToUVRow_C; void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = @@ -1144,6 +1241,17 @@ int RAWToI420(const uint8_t* src_raw, RAWToUVRow = RAWToUVRow_MSA; } } +#elif defined(HAS_RAWTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToUVRow = RAWToUVRow_Any_MMI; + RAWToYRow = RAWToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_MMI; + } + } + } // Other platforms do intermediate 
conversion from RAW to ARGB. #else #if defined(HAS_RAWTOARGBROW_SSSE3) @@ -1177,14 +1285,14 @@ int RAWToI420(const uint8_t* src_raw, #endif { -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); @@ -1201,7 +1309,7 @@ int RAWToI420(const uint8_t* src_raw, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); #else @@ -1210,7 +1318,7 @@ int RAWToI420(const uint8_t* src_raw, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1230,7 +1338,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, int width, int height) { int y; -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_MMI)) void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB565ToUVRow_C; @@ -1276,6 +1384,17 @@ int RGB565ToI420(const uint8_t* src_rgb565, RGB565ToUVRow = RGB565ToUVRow_MSA; } } +#elif defined(HAS_RGB565TOYROW_MMI) + if 
(TestCpuFlag(kCpuHasMMI)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MMI; + RGB565ToYRow = RGB565ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -1316,13 +1435,13 @@ int RGB565ToI420(const uint8_t* src_rgb565, #endif #endif { -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_MMI)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); @@ -1339,7 +1458,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_MMI)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else @@ -1348,7 +1467,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1368,7 +1487,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, int width, int height) { int y; -#if (defined(HAS_ARGB1555TOYROW_NEON) || 
defined(HAS_ARGB1555TOYROW_MSA)) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_MMI)) void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB1555ToUVRow_C; @@ -1415,6 +1534,17 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; } } +#elif defined(HAS_ARGB1555TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from ARGB1555 to ARGB. #else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -1455,14 +1585,14 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #endif #endif { -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_MMI)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_MMI)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, @@ -1481,7 +1611,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_MMI)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else @@ -1490,7 +1620,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1510,7 +1640,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, int width, int height) { int y; -#if defined(HAS_ARGB4444TOYROW_NEON) +#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB4444ToUVRow_C; @@ -1548,6 +1678,17 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } } +#elif defined(HAS_ARGB4444TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI; + ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_MMI; + } + } + } // Other 
platforms do intermediate conversion from ARGB4444 to ARGB. #else #if defined(HAS_ARGB4444TOARGBROW_SSE2) @@ -1606,17 +1747,29 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } + } +#endif #endif { -#if !defined(HAS_ARGB4444TOYROW_NEON) +#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_ARGB4444TOYROW_NEON) +#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, @@ -1635,7 +1788,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_ARGB4444TOYROW_NEON) +#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); #else @@ -1644,7 +1797,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_ARGB4444TOYROW_NEON) +#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) free_aligned_buffer_64(row); #endif } diff --git a/source/convert_argb.cc b/source/convert_argb.cc index f2fe474f..dddab611 100644..100755 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -885,6 +885,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = 
ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -1004,6 +1012,14 @@ int I400ToARGB(const uint8_t* src_y, } } #endif +#if defined(HAS_I400TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I400ToARGBRow = I400ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); @@ -1071,6 +1087,14 @@ int J400ToARGB(const uint8_t* src_y, } } #endif +#if defined(HAS_J400TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + J400ToARGBRow = J400ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + J400ToARGBRow = J400ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; @@ -1201,6 +1225,14 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RGB24ToARGBRow = RGB24ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -1260,6 +1292,14 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToARGBRow = RAWToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RAWToARGBRow = RAWToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -1327,6 +1367,14 @@ int RGB565ToARGB(const uint8_t* src_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RGB565ToARGBRow = RGB565ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -1394,6 +1442,14 @@ int ARGB1555ToARGB(const 
uint8_t* src_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -1461,6 +1517,14 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); diff --git a/source/convert_from.cc b/source/convert_from.cc index 6fa25323..cb5b4b51 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -302,6 +302,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToYUY2Row = I422ToYUY2Row_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -381,6 +389,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToUYVYRow = I422ToUYVYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -448,6 +464,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToUYVYRow = I422ToUYVYRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 
c8d91252..1b070c10 100644..100755 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -76,6 +76,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_MMI; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -108,6 +116,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -208,6 +224,23 @@ int ARGBToI422(const uint8_t* src_argb, } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); @@ -298,6 +331,22 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -330,6 +379,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_MMI) + if 
(TestCpuFlag(kCpuHasMMI)) { + MergeUVRow_ = MergeUVRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + MergeUVRow_ = MergeUVRow_MMI; + } + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -434,6 +491,23 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif + #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -466,6 +540,14 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow_ = MergeUVRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + MergeUVRow_ = MergeUVRow_MMI; + } + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -575,6 +657,22 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -607,6 +705,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToYUY2Row = I422ToYUY2Row_MMI; + } + } +#endif { // Allocate a rows of yuv. 
@@ -712,6 +818,22 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -744,6 +866,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToUYVYRow = I422ToUYVYRow_MMI; + } + } +#endif { // Allocate a rows of yuv. @@ -821,6 +951,14 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -911,6 +1049,14 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB24Row = ARGBToRGB24Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -977,6 +1123,14 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRAWRow = ARGBToRAWRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -1047,6 +1201,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + 
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, @@ -1116,6 +1278,14 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -1182,6 +1352,14 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1248,6 +1426,14 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); @@ -1424,6 +1610,14 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif #if defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUVJRow = ARGBToUVJRow_Any_MSA; @@ -1432,6 +1626,14 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { 
ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -1525,6 +1727,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif #if defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUVJRow = ARGBToUVJRow_Any_MSA; @@ -1533,6 +1743,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); @@ -1602,6 +1820,14 @@ int ARGBToJ400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 31e24b67..2283db6a 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -173,6 +173,9 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, if (strcmp(ase, " msa") == 0) { return kCpuHasMSA; } + if (strcmp(ase, " mmi") == 0) { + return kCpuHasMMI; + } return 0; } while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { @@ -185,6 +188,15 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, } return 0; } + } else if(memcmp(cpuinfo_line, "cpu model", 9) == 0) { + char* p = strstr(cpuinfo_line, "Loongson-3"); + if (p) { + fclose(f); + if (strcmp(ase, " mmi") == 0) { + return kCpuHasMMI; + } + return 0; + } } } fclose(f); @@ -232,6 +244,8 @@ static SAFEBUFFERS int GetCpuFlags(void) { #if defined(__mips__) && defined(__linux__) #if defined(__mips_msa) cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); +#elif 
defined(_MIPS_ARCH_LOONGSON3A) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi"); #endif cpu_info |= kCpuHasMIPS; #endif diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 5eae3f76..f4becdfe 100644..100755 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -410,6 +410,14 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_SPLITUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitUVRow = SplitUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SplitUVRow = SplitUVRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { // Copy a row of UV. @@ -478,6 +486,14 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow = MergeUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MergeUVRow = MergeUVRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -537,6 +553,14 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif +#if defined(HAS_SPLITRGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitRGBRow = SplitRGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + SplitRGBRow = SplitRGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { // Copy a row of RGB. @@ -593,6 +617,14 @@ void MergeRGBPlane(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGERGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeRGBRow = MergeRGBRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MergeRGBRow = MergeRGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of RGB. 
@@ -651,6 +683,14 @@ void MirrorPlane(const uint8_t* src_y, } } #endif +#if defined(HAS_MIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MirrorRow = MirrorRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MirrorRow = MirrorRow_MMI; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -734,6 +774,16 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + YUY2ToYRow = YUY2ToYRow_Any_MMI; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_MMI; + YUY2ToUV422Row = YUY2ToUV422Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -820,6 +870,16 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + UYVYToYRow = UYVYToYRow_Any_MMI; + UYVYToUV422Row = UYVYToUV422Row_Any_MMI; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_MMI; + UYVYToUV422Row = UYVYToUV422Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -890,6 +950,14 @@ int YUY2ToY(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + YUY2ToYRow = YUY2ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToYRow(src_yuy2, dst_y, width); @@ -1015,6 +1083,14 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBMirrorRow = ARGBMirrorRow_MMI; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -1048,6 +1124,11 @@ ARGBBlendRow GetARGBBlend() { ARGBBlendRow = ARGBBlendRow_MSA; } #endif +#if defined(HAS_ARGBBLENDROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBBlendRow = ARGBBlendRow_MMI; + } +#endif return ARGBBlendRow; } @@ -1140,6 +1221,14 @@ int 
BlendPlane(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BlendPlaneRow = BlendPlaneRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); @@ -1216,6 +1305,14 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BlendPlaneRow = BlendPlaneRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + BlendPlaneRow = BlendPlaneRow_MMI; + } + } +#endif if (!IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_C; } @@ -1252,6 +1349,17 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_SCALEROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + ScaleRowDown2 = ScaleRowDown2Box_MMI; + } + } + } +#endif // Row buffer for intermediate alpha pixels. 
align_buffer_64(halfalpha, halfwidth); @@ -1337,6 +1445,14 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBMultiplyRow = ARGBMultiplyRow_MMI; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -1414,6 +1530,14 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAddRow = ARGBAddRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAddRow = ARGBAddRow_MMI; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -1486,6 +1610,14 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBSubtractRow = ARGBSubtractRow_MMI; + } + } +#endif // Subtract plane for (y = 0; y < height; ++y) { @@ -1718,6 +1850,14 @@ int RAWToRGB24(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RAWToRGB24Row = RAWToRGB24Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -1939,6 +2079,14 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -2039,6 +2187,11 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBGrayRow = ARGBGrayRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, 
width); @@ -2084,6 +2237,11 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBGrayRow = ARGBGrayRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); @@ -2127,6 +2285,11 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif +#if defined(HAS_ARGBSEPIAROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBSepiaRow = ARGBSepiaRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); @@ -2178,6 +2341,11 @@ int ARGBColorMatrix(const uint8_t* src_argb, ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; } #endif +#if defined(HAS_ARGBCOLORMATRIXROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); src_argb += src_stride_argb; @@ -2372,6 +2540,12 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb, ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; } #endif +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. for (y = 0; y < height; ++y) { ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); @@ -2430,6 +2604,11 @@ int ARGBBlur(const uint8_t* src_argb, CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; + } +#endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. 
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, @@ -2536,6 +2715,11 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif +#if defined(HAS_ARGBSHADEROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBShadeRow = ARGBShadeRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -2607,6 +2791,14 @@ int InterpolatePlane(const uint8_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -2730,6 +2922,14 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBShuffleRow = ARGBShuffleRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); @@ -2801,6 +3001,14 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -2817,6 +3025,11 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelYRow = SobelYRow_MSA; } #endif +#if defined(HAS_SOBELYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelYRow = SobelYRow_MMI; + } +#endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; @@ -2832,6 +3045,11 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelXRow = SobelXRow_MSA; } #endif +#if defined(HAS_SOBELXROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelXRow = SobelXRow_MMI; + } +#endif { // 3 rows with edges before/after. 
const int kRowSize = (width + kEdge + 31) & ~31; @@ -2914,6 +3132,14 @@ int ARGBSobel(const uint8_t* src_argb, } } #endif +#if defined(HAS_SOBELROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelRow = SobelRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_MMI; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); } @@ -2952,6 +3178,14 @@ int ARGBSobelToPlane(const uint8_t* src_argb, } } #endif +#if defined(HAS_SOBELTOPLANEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SobelToPlaneRow = SobelToPlaneRow_MMI; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, height, SobelToPlaneRow); } @@ -2991,6 +3225,14 @@ int ARGBSobelXY(const uint8_t* src_argb, } } #endif +#if defined(HAS_SOBELXYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelXYRow = SobelXYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_MMI; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); } @@ -3228,6 +3470,14 @@ int ARGBCopyAlpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBCOPYALPHAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBCopyAlphaRow(src_argb, dst_argb, width); @@ -3286,6 +3536,12 @@ int ARGBExtractAlpha(const uint8_t* src_argb, : ARGBExtractAlphaRow_Any_MSA; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? 
ARGBExtractAlphaRow_MMI + : ARGBExtractAlphaRow_Any_MMI; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); @@ -3337,6 +3593,14 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBCopyYToAlphaRow(src_y, dst_argb, width); @@ -3406,6 +3670,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_SPLITUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitUVRow = SplitUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SplitUVRow = SplitUVRow_MMI; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -3438,6 +3710,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif { int awidth = halfwidth * 2; @@ -3522,6 +3802,14 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitUVRow = SplitUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SplitUVRow = SplitUVRow_MMI; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -3554,6 +3842,14 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif { int awidth = halfwidth * 2; diff --git a/source/rotate.cc b/source/rotate.cc index f2bed85b..f28a06d3 100644..100755 --- a/source/rotate.cc +++ b/source/rotate.cc @@ 
-49,6 +49,11 @@ void TransposePlane(const uint8_t* src, } } #endif +#if defined(HAS_TRANSPOSEWX8_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + TransposeWx8 = TransposeWx8_MMI; + } +#endif #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; @@ -166,6 +171,14 @@ void RotatePlane180(const uint8_t* src, } } #endif +#if defined(HAS_MIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MirrorRow = MirrorRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MirrorRow = MirrorRow_MMI; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -186,6 +199,11 @@ void RotatePlane180(const uint8_t* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI; + } +#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { @@ -232,6 +250,14 @@ void TransposeUV(const uint8_t* src, } } #endif +#if defined(HAS_TRANSPOSEUVWX8_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + TransposeUVWx8 = TransposeUVWx8_Any_MMI; + if (IS_ALIGNED(width, 4)) { + TransposeUVWx8 = TransposeUVWx8_MMI; + } + } +#endif #if defined(HAS_TRANSPOSEUVWX16_MSA) if (TestCpuFlag(kCpuHasMSA)) { TransposeUVWx16 = TransposeUVWx16_Any_MSA; @@ -331,6 +357,11 @@ void RotateUV180(const uint8_t* src, MirrorUVRow = MirrorUVRow_MSA; } #endif +#if defined(HAS_MIRRORUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_MMI; + } +#endif dst_a += dst_stride_a * (height - 1); dst_b += dst_stride_b * (height - 1); diff --git a/source/rotate_any.cc b/source/rotate_any.cc index c2752e62..b3baf084 100644..100755 --- a/source/rotate_any.cc +++ b/source/rotate_any.cc @@ -35,6 +35,9 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) #ifdef HAS_TRANSPOSEWX8_SSSE3 TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #endif +#ifdef HAS_TRANSPOSEWX8_MMI +TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7) +#endif #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif @@ -62,6 +65,9 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #ifdef HAS_TRANSPOSEUVWX8_SSE2 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #endif +#ifdef HAS_TRANSPOSEUVWX8_MMI +TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7) +#endif #ifdef HAS_TRANSPOSEUVWX16_MSA TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) #endif diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index 5a6e0537..a93fd55f 100644..100755 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -56,6 +56,14 @@ static void ARGBTranspose(const uint8_t* src_argb, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI; + if (IS_ALIGNED(height, 4)) { // Width of dest. 
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI; + } + } +#endif for (i = 0; i < width; ++i) { // column of source to row of dest. ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); @@ -142,6 +150,14 @@ void ARGBRotate180(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBMirrorRow = ARGBMirrorRow_MMI; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; diff --git a/source/rotate_mmi.cc b/source/rotate_mmi.cc new file mode 100644 index 00000000..435fda11 --- /dev/null +++ b/source/rotate_mmi.cc @@ -0,0 +1,290 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +void TransposeWx8_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; + uint8_t* src_tmp = nullptr; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[tmp12], 0x00(%[src]) \n\t" + "dadd %[src_tmp], %[src], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 = (00 10 01 11 02 12 03 13) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (04 14 05 15 06 16 07 17) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (20 30 21 31 22 32 23 33) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (24 34 25 35 26 36 27 37) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp4 = (00 10 20 30 01 11 21 31) */ + "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" + /* tmp5 = (02 12 22 32 03 13 23 33) */ + "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" + /* tmp6 = (04 14 24 34 05 15 25 35) */ + "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" + /* tmp7 = (06 16 26 36 07 17 27 37) */ + "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 = (40 50 41 51 42 52 43 53) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (44 54 45 55 46 56 47 57) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], 
%[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (60 70 61 71 62 72 63 73) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (64 74 65 75 66 76 67 77) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp8 = (40 50 60 70 41 51 61 71) */ + "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" + /* tmp9 = (42 52 62 72 43 53 63 73) */ + "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" + /* tmp10 = (44 54 64 74 45 55 65 75) */ + "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" + /* tmp11 = (46 56 66 76 47 57 67 77) */ + "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" + + /* tmp0 = (00 10 20 30 40 50 60 70) */ + "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" + /* tmp1 = (01 11 21 31 41 51 61 71) */ + "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + /* tmp0 = (02 12 22 32 42 52 62 72) */ + "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" + /* tmp1 = (03 13 23 33 43 53 63 73) */ + "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + /* tmp0 = (04 14 24 34 44 54 64 74) */ + "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" + /* tmp1 = (05 15 25 35 45 55 65 75) */ + "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + /* tmp0 = (06 16 26 36 46 56 66 76) */ + "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" + /* tmp1 = (07 17 27 37 47 57 67 77) */ + "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" + "dadd %[dst], 
%[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "daddi %[src], %[src], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + + : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), + [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), + [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), + [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), + [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), + [src_tmp] "+&r"(src_tmp) + : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), + [dst_stride] "r"(dst_stride) + : "memory"); +} + +void TransposeUVWx8_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; + uint8_t* src_tmp = nullptr; + + __asm__ volatile( + "1: \n\t" + /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ + "ldc1 %[tmp12], 0x00(%[src]) \n\t" + "dadd %[src_tmp], %[src], %[src_stride] \n\t" + /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (u22 
u32 v22 v32 u23 u33 v23 v33) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ + "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" + /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ + "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" + /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ + "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" + /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ + "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ + "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" + /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ + "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" + /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */ + "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" + /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ + "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" + + /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ + "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" + /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ + "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" + 
"gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ + "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" + /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ + "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ + "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" + /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ + "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ + "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" + /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ + "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "daddiu %[src], %[src], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), + [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), + [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), + [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), + 
[tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), + [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) + : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), + [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) + : "memory"); +} + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_any.cc b/source/row_any.cc index e91560c4..031a8f64 100644..100755 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -92,6 +92,9 @@ ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) #ifdef HAS_MERGERGBROW_NEON ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) #endif +#ifdef HAS_MERGERGBROW_MMI +ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) +#endif #ifdef HAS_I422TOYUY2ROW_SSE2 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) @@ -106,18 +109,27 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_MSA ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOYUY2ROW_MMI +ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif #ifdef HAS_I422TOUYVYROW_MSA ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOUYVYROW_MMI +ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif #ifdef HAS_BLENDPLANEROW_SSSE3 ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) #endif +#ifdef HAS_BLENDPLANEROW_MMI +ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7) +#endif #undef ANY31 // Note that odd width replication includes 444 due to implementation @@ -271,6 +283,9 @@ ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #ifdef HAS_MERGEUVROW_MSA 
ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) #endif +#ifdef HAS_MERGEUVROW_MMI +ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7) +#endif // Math functions. #ifdef HAS_ARGBMULTIPLYROW_SSE2 @@ -303,12 +318,21 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBMULTIPLYROW_MSA ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #endif +#ifdef HAS_ARGBMULTIPLYROW_MMI +ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1) +#endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBADDROW_MMI +ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1) +#endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBSUBTRACTROW_MMI +ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1) +#endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) #endif @@ -318,6 +342,9 @@ ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #ifdef HAS_SOBELROW_MSA ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) #endif +#ifdef HAS_SOBELROW_MMI +ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7) +#endif #ifdef HAS_SOBELTOPLANEROW_SSE2 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) #endif @@ -327,6 +354,9 @@ ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #ifdef HAS_SOBELTOPLANEROW_MSA ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) #endif +#ifdef HAS_SOBELTOPLANEROW_MMI +ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7) +#endif #ifdef HAS_SOBELXYROW_SSE2 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) #endif @@ -336,6 +366,9 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #ifdef HAS_SOBELXYROW_MSA ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) #endif +#ifdef HAS_SOBELXYROW_MMI +ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 
4, 7) +#endif #undef ANY21 // Any 2 planes to 1 with yuvconstants @@ -521,12 +554,24 @@ ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) #endif +#if defined(HAS_ARGBTORGB24ROW_MMI) +ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) +ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3) +ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) +ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) +ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) +ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) +ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif #if defined(HAS_RAWTORGB24ROW_MSA) ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) #endif +#if defined(HAS_RAWTORGB24ROW_MMI) +ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) +#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif @@ -558,57 +603,87 @@ ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_MMI +ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_ARGBTOYJROW_MSA ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYJROW_MMI +ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_BGRATOYROW_MSA ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_BGRATOYROW_MMI +ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef 
HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_ABGRTOYROW_MSA ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) #endif +#ifdef HAS_ABGRTOYROW_MMI +ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_RGBATOYROW_MSA ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYROW_MMI +ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif #ifdef HAS_RGB24TOYROW_MSA ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #endif +#ifdef HAS_RGB24TOYROW_MMI +ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7) +#endif #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif #ifdef HAS_RAWTOYROW_MSA ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) #endif +#ifdef HAS_RAWTOYROW_MMI +ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif #ifdef HAS_RGB565TOYROW_MSA ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) #endif +#ifdef HAS_RGB565TOYROW_MMI +ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7) +#endif #ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif #ifdef HAS_ARGB1555TOYROW_MSA ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) #endif +#ifdef HAS_ARGB1555TOYROW_MMI +ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7) +#endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_ARGB4444TOYROW_MMI +ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7) +#endif #ifdef HAS_YUY2TOYROW_NEON ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif @@ -618,39 +693,60 @@ ANY11(UYVYToYRow_Any_NEON, 
UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOYROW_MMI +ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7) +#endif #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_UYVYTOYROW_MMI +ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) +#endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif #ifdef HAS_RGB24TOARGBROW_MSA ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) #endif +#ifdef HAS_RGB24TOARGBROW_MMI +ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3) +#endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif #ifdef HAS_RAWTOARGBROW_MSA ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) #endif +#ifdef HAS_RAWTOARGBROW_MMI +ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3) +#endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_RGB565TOARGBROW_MSA ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) #endif +#ifdef HAS_RGB565TOARGBROW_MMI +ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3) +#endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_ARGB1555TOARGBROW_MSA ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) #endif +#ifdef HAS_ARGB1555TOARGBROW_MMI +ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3) +#endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_ARGB4444TOARGBROW_MSA ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) #endif +#ifdef HAS_ARGB4444TOARGBROW_MMI +ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif @@ -669,6 +765,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MSA ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_MMI +ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif @@ -681,6 +780,9 @@ ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #ifdef HAS_ARGBEXTRACTALPHAROW_MSA ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MMI +ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7) +#endif #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. @@ -705,12 +807,18 @@ ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) #ifdef HAS_ARGBCOPYALPHAROW_SSE2 ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBCOPYALPHAROW_MMI +ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1) +#endif #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) #endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI +ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) +#endif #undef ANY11B // Any 1 to 1 with parameter. 
@@ -760,6 +868,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA, 2, 7) #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MMI) +ANY11P(ARGBToRGB565DitherRow_Any_MMI, + ARGBToRGB565DitherRow_MMI, + const uint32_t, + 4, + 2, + 3) +#endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #endif @@ -772,6 +888,10 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #ifdef HAS_ARGBSHUFFLEROW_MSA ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif +#ifdef HAS_ARGBSHUFFLEROW_MMI +ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) +#endif +#undef ANY11P #undef ANY11P // Any 1 to 1 with parameter and shorts. BPP measures in shorts. @@ -940,6 +1060,9 @@ ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #ifdef HAS_INTERPOLATEROW_MSA ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif +#ifdef HAS_INTERPOLATEROW_MMI +ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) +#endif #undef ANY11T // Any 1 to 1 mirror. @@ -969,6 +1092,9 @@ ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif +#ifdef HAS_MIRRORROW_MMI +ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ -981,6 +1107,9 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif +#ifdef HAS_ARGBMIRRORROW_MMI +ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) +#endif #undef ANY11M // Any 1 plane. 
(memset) @@ -1039,6 +1168,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #ifdef HAS_SPLITUVROW_MSA ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) #endif +#ifdef HAS_SPLITUVROW_MMI +ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7) +#endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) #endif @@ -1060,6 +1192,11 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOUV422ROW_MMI +ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7) +ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15) +ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15) +#endif #undef ANY12 // Any 1 to 3. Outputs RGB planes. @@ -1086,6 +1223,9 @@ ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) #ifdef HAS_SPLITRGBROW_NEON ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) #endif +#ifdef HAS_SPLITRGBROW_MMI +ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) +#endif // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. 
@@ -1140,57 +1280,87 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVROW_MMI +ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVJROW_MMI +ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15) +#endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_BGRATOUVROW_MSA ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_BGRATOUVROW_MMI +ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_MSA ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVROW_MMI +ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_MSA ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_RGBATOUVROW_MMI +ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_RGB24TOUVROW_NEON ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif #ifdef HAS_RGB24TOUVROW_MSA ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVROW_MMI +ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15) +#endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif #ifdef HAS_RAWTOUVROW_MSA ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVROW_MMI +ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15) +#endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) 
#endif #ifdef HAS_RGB565TOUVROW_MSA ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) #endif +#ifdef HAS_RGB565TOUVROW_MMI +ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15) +#endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif #ifdef HAS_ARGB1555TOUVROW_MSA ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) #endif +#ifdef HAS_ARGB1555TOUVROW_MMI +ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15) +#endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_ARGB4444TOUVROW_MMI +ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15) +#endif #ifdef HAS_YUY2TOUVROW_NEON ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #endif @@ -1200,9 +1370,15 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #ifdef HAS_YUY2TOUVROW_MSA ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_YUY2TOUVROW_MMI +ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15) +#endif #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_UYVYTOUVROW_MMI +ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) +#endif #undef ANY12S #ifdef __cplusplus diff --git a/source/row_mmi.cc b/source/row_mmi.cc new file mode 100644 index 00000000..6c288597 --- /dev/null +++ b/source/row_mmi.cc @@ -0,0 +1,5972 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <sys/time.h> +#include "libyuv/row.h" + +#include <stdio.h> +#include <string.h> // For memcpy and memset. 
+ +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + uint64_t src0, src1, dest; + const uint64_t mask = 0xff000000ULL; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask] \n\t" + "or %[src1], %[src1], %[mask] \n\t" + "punpcklwd %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask] \n\t" + "or %[src1], %[src1], %[mask] \n\t" + "punpcklwd %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), + [mask] "f"(mask) + : "memory"); +} + +void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + uint64_t src0, src1, dest; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0xff000000ULL; + const uint64_t mask2 = 0xc6; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask1] \n\t" + "punpcklbh %[src0], %[src0], %[mask0] \n\t" + "pshufh %[src0], %[src0], %[mask2] \n\t" + "or %[src1], %[src1], %[mask1] \n\t" + "punpcklbh %[src1], 
%[src1], %[mask0] \n\t" + "pshufh %[src1], %[src1], %[mask2] \n\t" + "packushb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask1] \n\t" + "punpcklbh %[src0], %[src0], %[mask0] \n\t" + "pshufh %[src0], %[src0], %[mask2] \n\t" + "or %[src1], %[src1], %[mask1] \n\t" + "punpcklbh %[src1], %[src1], %[mask0] \n\t" + "pshufh %[src1], %[src1], %[mask2] \n\t" + "packushb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), + [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width) + : "memory"); +} + +void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + uint64_t src0, src1; + uint64_t ftmp[4]; + uint64_t mask0 = 0xc6; + uint64_t mask1 = 0x6c; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" + "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" + "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" + + "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" + "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" + "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" + "punpcklbh %[src1], %[src1], %[zero] \n\t" + "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" + "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" + "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" + "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pshufh %[src1], %[src1], %[mask1] \n\t" + "pextrh %[ftmp2], %[src1], 
%[zero] \n\t" + "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[src1], %[src1], %[zero] \n\t" + + "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" + "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" + "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" + "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" + + "daddiu %[src_raw], %[src_raw], 0x0c \n\t" + "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), + [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) + : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), + [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), + [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) + : "memory"); +} + +void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + uint64_t ftmp[5]; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0007000700070007; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psrlh %[r], %[src1], %[three] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[two] \n\t" + "psrlh %[src1], %[g], %[four] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "packushb %[b], %[b], %[r] \n\t" + "packushb %[g], %[g], %[c1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], 
%[b], %[g] \n\t" + "punpcklhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" + "punpckhhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" + "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" + "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), + [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) + : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), + [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), + [four] "f"(0x04) + : "memory"); +} + +void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + uint64_t ftmp[6]; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0003000300030003; + uint64_t c3 = 0x007c007c007c007c; + uint64_t c4 = 0x0001000100010001; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "and %[r], %[src1], %[c3] \n\t" + "psrlh %[r], %[r], %[two] \n\t" + "psrlh %[a], %[src1], %[seven] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[three] \n\t" + "psrlh %[src1], %[g], %[two] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "xor %[a], %[a], %[c1] \n\t" + "paddb %[a], %[a], %[c4] \n\t" + "packushb %[b], %[b], %[r] \n\t" + 
"packushb %[g], %[g], %[a] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" + "punpckhhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" + "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" + "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), + [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) + : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), + [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), + [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) + : "memory"); +} + +void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + uint64_t ftmp[6]; + uint64_t c0 = 0x000f000f000f000f; + uint64_t c1 = 0x00ff00ff00ff00ff; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g], %[src0], %[four] \n\t" + "and %[r], %[src1], %[c0] \n\t" + "psrlh %[a], %[src1], %[four] \n\t" + "psllh %[src0], %[b], %[four] \n\t" + "or %[b], %[src0], %[b] \n\t" + "psllh %[src0], %[g], %[four] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psllh %[src0], %[r], %[four] \n\t" + "or %[r], %[src0], %[r] \n\t" + "psllh %[src0], %[a], %[four] \n\t" + "or %[a], %[src0], %[a] \n\t" + "packushb %[b], %[b], %[r] \n\t" + "packushb %[g], %[g], %[a] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[r], 
0x07(%[dst_argb]) \n\t" + "punpckhhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" + "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" + "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), + [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) + : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), + [four] "f"(0x04) + : "memory"); +} + +void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + uint64_t src; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" + + "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t" + + "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" + + "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) + : "memory"); +} + +void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + uint64_t src0, src1; + uint64_t ftmp[3]; + uint64_t mask0 = 0xc6; + uint64_t mask1 = 0x18; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + 
"gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" + "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" + "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" + "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + + "pextrh %[src0], %[ftmp1], %[two] \n\t" + "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" + + "pextrh %[src0], %[ftmp2], %[two] \n\t" + "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" + "pextrh %[src0], %[ftmp2], %[one] \n\t" + "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" + "pextrh %[src0], %[ftmp2], %[zero] \n\t" + "pshufh %[src1], %[src1], %[mask1] \n\t" + "pinsrh_0 %[src1], %[src1], %[src0] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[src1], %[src1], %[zero] \n\t" + + "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" + "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" + "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), + [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), + [one] "f"(0x01), [two] "f"(0x02) + : "memory"); +} + +void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + uint64_t src0, src1; + uint64_t ftmp[3]; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], %[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + 
"punpcklbh %[b], %[src0], %[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + + "psrlh %[b], %[b], %[three] \n\t" + "psrlh %[g], %[g], %[two] \n\t" + "psrlh %[r], %[r], %[three] \n\t" + + "psllh %[g], %[g], %[five] \n\t" + "psllh %[r], %[r], %[eleven] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), + [eleven] "f"(0x0b) + : "memory"); +} + +// dither4 is a row of 4 values from 4x4 dither matrix. +// The 4x4 matrix contains values to increase RGB. When converting to +// fewer bits (565) this provides an ordered dither. +// The order in the 4x4 matrix in the first byte is upper left. +// The 4 values are passed as an int, then referenced as an array, so +// endian will not affect order of the original matrix. But the dither4 +// will contain the first pixel in the lower byte for little endian +// or the upper byte for big endian. 
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + uint64_t src0, src1; + uint64_t ftmp[3]; + uint64_t c0 = 0x00ff00ff00ff00ff; + + __asm__ volatile( + "punpcklbh %[dither], %[dither], %[zero] \n\t" + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], %[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklbh %[b], %[src0], %[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + + "paddh %[b], %[b], %[dither] \n\t" + "paddh %[g], %[g], %[dither] \n\t" + "paddh %[r], %[r], %[dither] \n\t" + "pcmpgth %[src0], %[b], %[c0] \n\t" + "or %[src0], %[src0], %[b] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "pcmpgth %[src0], %[g], %[c0] \n\t" + "or %[src0], %[src0], %[g] \n\t" + "and %[g], %[src0], %[c0] \n\t" + "pcmpgth %[src0], %[r], %[c0] \n\t" + "or %[src0], %[src0], %[r] \n\t" + "and %[r], %[src0], %[c0] \n\t" + + "psrlh %[b], %[b], %[three] \n\t" + "psrlh %[g], %[g], %[two] \n\t" + "psrlh %[r], %[r], %[three] \n\t" + + "psllh %[g], %[g], %[five] \n\t" + "psllh %[r], %[r], %[eleven] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), + [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) + : "memory"); +} + 
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + uint64_t src0, src1; + uint64_t ftmp[4]; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], %[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklbh %[b], %[src0], %[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + "punpckhbh %[a], %[src1], %[zero] \n\t" + + "psrlh %[b], %[b], %[three] \n\t" + "psrlh %[g], %[g], %[three] \n\t" + "psrlh %[r], %[r], %[three] \n\t" + "psrlh %[a], %[a], %[seven] \n\t" + + "psllh %[g], %[g], %[five] \n\t" + "psllh %[r], %[r], %[ten] \n\t" + "psllh %[a], %[a], %[fifteen] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + "or %[b], %[b], %[a] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), + [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) + : "memory"); +} + +void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + uint64_t src0, src1; + uint64_t ftmp[4]; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], 
%[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklbh %[b], %[src0], %[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + "punpckhbh %[a], %[src1], %[zero] \n\t" + + "psrlh %[b], %[b], %[four] \n\t" + "psrlh %[g], %[g], %[four] \n\t" + "psrlh %[r], %[r], %[four] \n\t" + "psrlh %[a], %[a], %[four] \n\t" + + "psllh %[g], %[g], %[four] \n\t" + "psllh %[r], %[r], %[eight] \n\t" + "psllh %[a], %[a], %[twelve] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + "or %[b], %[b], %[a] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), + [twelve] "f"(0x0c) + : "memory"); +} + +void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0001004200810019; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 
0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], 
%[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void ARGBToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x0026004a00700002; + const uint64_t mask_v = 0x00020070005e0012; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest0_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" + "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + 
"paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest1_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" + "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh 
%[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest2_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" + "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], 
%[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest3_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" + "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], 
%[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, 
dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0019008100420001; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw 
%[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void BGRAToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x00020070004a0026; + const uint64_t mask_v = 0x0012005e00700002; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" 
+ "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" + "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src1], %[src0] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src0], %[src1] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" + 
"pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" + "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src1], %[src0] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src0], %[src1] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" + 
"pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src1], %[src0] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src0], %[src1] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" + "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" + 
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src1], %[src0] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src0], %[src1] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] 
"=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0001001900810042; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + 
"gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void ABGRToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const 
uint64_t mask_u = 0x00020070004a0026; + const uint64_t mask_v = 0x0012005e00700002; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" + "dsll %[dest0_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src1], %[src0] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd 
%[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src0], %[src1] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" + "dsll %[dest1_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src1], %[src0] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw 
%[dest1_v], %[src0], %[src1] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" + "dsll %[dest2_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src1], %[src0] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src0], %[src1] \n\t" + "psraw 
%[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" + "dsll %[dest3_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src1], %[src0] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src0], %[src1] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh 
%[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0042008100190001; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd 
%[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 
%[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} +

// RGBA -> chroma (U/V) row conversion, Loongson-3A MMI (64-bit SIMD held in
// FP registers).  Consumes 16 RGBA pixels (0x40 bytes) from two vertically
// adjacent rows per iteration and emits 8 U and 8 V bytes (2x2 subsampling).
// The 2x2 box average is computed with four paddh plus a psrlh-by-2; the
// averaged pixel pair is then dotted with the U/V coefficient vectors via
// pmaddhw.  value 0x4040, multiplied by the 0x0002 coefficient in the masks,
// contributes 0x8080 (the 128 chroma offset plus rounding) before the final
// >>8.  width is presumably a multiple of 16 — loop exits on bgtz after
// subtracting 0x10; TODO confirm against callers.
// NOTE(review): width, src_rgb0, dst_u and dst_v are declared as input-only
// "r" operands yet are modified by daddi/daddiu inside the asm; per GCC
// extended-asm rules these look like they should be "+r" — verify.
void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t src_rgb1;      // pointer to the row below src_rgb0
  uint64_t ftmp[12];      // scratch MMI registers
  const uint64_t value = 0x4040;              // bias halfword (x2 -> 0x8080)
  const uint64_t mask_u = 0x0026004a00700002; // U coefficients + bias weight
  const uint64_t mask_v = 0x00020070005e0012; // V coefficients + bias weight

  __asm__ volatile(
      "1:  \n\t"
      // Second source row = first row + stride.
      "daddu      %[src_rgb1],  %[src_rgb0],  %[src_stride_rgb]  \n\t"

      // Pixels 0..1 of both rows: unpack bytes to halfwords, sum the 2x2
      // neighborhood, >>2 to average, then insert the bias and pmaddhw with
      // the U (low pixel) and V (high pixel, via dsrl 16) coefficients.
      "gsldrc1    %[src0],  0x00(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x07(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x00(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x07(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[dest0_u],  %[src0],  %[value]  \n\t"
      "dsrl       %[dest0_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[dest0_v],  %[dest0_v],  %[value]  \n\t"
      "pmaddhw    %[dest0_u],  %[dest0_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest0_v],  %[dest0_v],  %[mask_v]  \n\t"

      // Pixels 2..3: same averaging; partial products land in src_lo/src_hi
      // and are folded into dest0_u/dest0_v (punpck + psubw pairs the
      // positive and negative pmaddhw halves), then >>8 to scale.
      "gsldrc1    %[src0],  0x08(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x0f(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x08(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x0f(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsrl       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest0_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest0_u],  %[src_lo]  \n\t"
      "psubw      %[dest0_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest0_u],  %[dest0_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest0_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest0_v],  %[src_hi]  \n\t"
      "psubw      %[dest0_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest0_v],  %[dest0_v],  %[eight]  \n\t"

      // Pixels 4..5 -> dest1_u/dest1_v (first half of pair).
      "gsldrc1    %[src0],  0x10(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x17(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x10(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x17(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[dest1_u],  %[src0],  %[value]  \n\t"
      "dsrl       %[dest1_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[dest1_v],  %[dest1_v],  %[value]  \n\t"
      "pmaddhw    %[dest1_u],  %[dest1_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest1_v],  %[dest1_v],  %[mask_v]  \n\t"

      // Pixels 6..7: fold into dest1_u/dest1_v.
      "gsldrc1    %[src0],  0x18(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x1f(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x18(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x1f(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsrl       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest1_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest1_u],  %[src_lo]  \n\t"
      "psubw      %[dest1_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest1_u],  %[dest1_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest1_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest1_v],  %[src_hi]  \n\t"
      "psubw      %[dest1_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest1_v],  %[dest1_v],  %[eight]  \n\t"

      // Pixels 8..9 -> dest2_u/dest2_v.
      "gsldrc1    %[src0],  0x20(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x27(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x20(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x27(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[dest2_u],  %[src0],  %[value]  \n\t"
      "dsrl       %[dest2_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[dest2_v],  %[dest2_v],  %[value]  \n\t"
      "pmaddhw    %[dest2_u],  %[dest2_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest2_v],  %[dest2_v],  %[mask_v]  \n\t"

      // Pixels 10..11: fold into dest2_u/dest2_v.
      "gsldrc1    %[src0],  0x28(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x2f(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x28(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x2f(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsrl       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest2_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest2_u],  %[src_lo]  \n\t"
      "psubw      %[dest2_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest2_u],  %[dest2_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest2_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest2_v],  %[src_hi]  \n\t"
      "psubw      %[dest2_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest2_v],  %[dest2_v],  %[eight]  \n\t"

      // Pixels 12..13 -> dest3_u/dest3_v.
      "gsldrc1    %[src0],  0x30(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x37(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x30(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x37(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[dest3_u],  %[src0],  %[value]  \n\t"
      "dsrl       %[dest3_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[dest3_v],  %[dest3_v],  %[value]  \n\t"
      "pmaddhw    %[dest3_u],  %[dest3_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest3_v],  %[dest3_v],  %[mask_v]  \n\t"

      // Pixels 14..15: fold into dest3_u/dest3_v.
      "gsldrc1    %[src0],  0x38(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x3f(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x38(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x3f(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_0   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsrl       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest3_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest3_u],  %[src_lo]  \n\t"
      "psubw      %[dest3_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest3_u],  %[dest3_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest3_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest3_v],  %[src_hi]  \n\t"
      "psubw      %[dest3_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest3_v],  %[dest3_v],  %[eight]  \n\t"

      // Saturate the 8 U words to bytes and store; ditto for V.
      "packsswh   %[src0],  %[dest0_u],  %[dest1_u]  \n\t"
      "packsswh   %[src1],  %[dest2_u],  %[dest3_u]  \n\t"
      "packushb   %[dest0_u],  %[src0],  %[src1]  \n\t"
      "gssdlc1    %[dest0_u],  0x07(%[dst_u])  \n\t"
      "gssdrc1    %[dest0_u],  0x00(%[dst_u])  \n\t"
      "packsswh   %[src0],  %[dest0_v],  %[dest1_v]  \n\t"
      "packsswh   %[src1],  %[dest2_v],  %[dest3_v]  \n\t"
      "packushb   %[dest0_v],  %[src0],  %[src1]  \n\t"
      "gssdlc1    %[dest0_v],  0x07(%[dst_v])  \n\t"
      "gssdrc1    %[dest0_v],  0x00(%[dst_v])  \n\t"

      // Advance 16 pixels (0x40 bytes) and loop while width > 0.
      "daddiu     %[src_rgb0],  %[src_rgb0],  0x40  \n\t"
      "daddiu     %[dst_u],  %[dst_u],  0x08  \n\t"
      "daddiu     %[dst_v],  %[dst_v],  0x08  \n\t"
      "daddi      %[width],  %[width],  -0x10  \n\t"
      "bgtz       %[width],  1b  \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

// RGB24 -> luma (Y) row conversion, MMI.  Converts 8 packed 3-byte pixels
// (0x18 bytes) per iteration into 8 Y bytes.  The coefficient vector mask
// = 0x0001004200810019 dots each pixel's three channel halfwords with
// 0x19/0x81/0x42 and the inserted bias halfword (value = 0x1080 =
// (16 << 8) + 128, i.e. the +16 luma offset plus 0.5 rounding) with weight
// 1; psrlw by 8 scales back to bytes.  The dsll-by-8 realigns the packed
// 3-byte stream so punpckhbh picks up the next pixel triple.
// NOTE(review): same input-only-operand concern as RGBAToUVRow_MMI; width
// is assumed to be a multiple of 8 (bnez exit) — TODO confirm.
void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0001004200810019;

  __asm__ volatile(
      "1:  \n\t"
      // Pixels 0..1 (bytes 0x00..0x05) -> dest0.
      "gsldlc1    %[src],  0x07(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x00(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest0],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest0],  %[dest0],  %[src]  \n\t"
      "psrlw      %[dest0],  %[dest0],  %[eight]  \n\t"

      // Pixels 2..3 (bytes 0x06..0x0b) -> dest1.
      "gsldlc1    %[src],  0x0d(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x06(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest1],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest1],  %[dest1],  %[src]  \n\t"
      "psrlw      %[dest1],  %[dest1],  %[eight]  \n\t"

      // Pixels 4..5 (bytes 0x0c..0x11) -> dest2.
      "gsldlc1    %[src],  0x13(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x0c(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest2],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest2],  %[dest2],  %[src]  \n\t"
      "psrlw      %[dest2],  %[dest2],  %[eight]  \n\t"

      // Pixels 6..7 (bytes 0x12..0x17) -> dest3.
      "gsldlc1    %[src],  0x19(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x12(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest3],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest3],  %[dest3],  %[src]  \n\t"
      "psrlw      %[dest3],  %[dest3],  %[eight]  \n\t"

      // Pack the 8 Y words to bytes, store, advance, loop.
      "packsswh   %[src_lo],  %[dest0],  %[dest1]  \n\t"
      "packsswh   %[src_hi],  %[dest2],  %[dest3]  \n\t"
      "packushb   %[dest0],  %[src_lo],  %[src_hi]  \n\t"
      "gssdlc1    %[dest0],  0x07(%[dst_y])  \n\t"
      "gssdrc1    %[dest0],  0x00(%[dst_y])  \n\t"
      "daddiu     %[src_argb0],  %[src_argb0],  0x18  \n\t"
      "daddiu     %[dst_y],  %[dst_y],  0x08  \n\t"
      "daddi      %[width],  %[width],  -0x08  \n\t"
      "bnez       %[width],  1b  \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

// RGB24 -> chroma (U/V) row conversion, MMI.  Consumes 16 packed 3-byte
// pixels (0x30 bytes) from two adjacent rows per iteration, emits 8 U and
// 8 V bytes (2x2 subsampling).  Structure mirrors RGBAToUVRow_MMI, with two
// differences forced by the 3-byte packing: each load advances 6 bytes and
// the dsll-by-8 realigns the second pixel of the pair, and the bias is
// placed with dsll-16 + pinsrh_0 (U) / pinsrh_3 (V) instead of the RGBA
// dsrl form.
// NOTE(review): same input-only "r" operand concern as RGBAToUVRow_MMI.
void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
                      int src_stride_rgb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x0026004a00700002;
  const uint64_t mask_v = 0x00020070005e0012;

  __asm__ volatile(
      "1:  \n\t"
      "daddu      %[src_rgb1],  %[src_rgb0],  %[src_stride_rgb]  \n\t"

      // Pixels 0..1 (both rows): 2x2 box average, then U/V pmaddhw.
      "gsldrc1    %[src0],  0x00(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x07(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x00(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x07(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[dest0_u],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest0_u],  %[dest0_u],  %[value]  \n\t"
      "pinsrh_3   %[dest0_v],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[dest0_u],  %[dest0_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest0_v],  %[dest0_v],  %[mask_v]  \n\t"

      // Pixels 2..3: fold into dest0_u/dest0_v and >>8.
      "gsldrc1    %[src0],  0x06(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x0d(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x06(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x0d(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[src_lo],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pinsrh_3   %[src_hi],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest0_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest0_u],  %[src_lo]  \n\t"
      "psubw      %[dest0_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest0_u],  %[dest0_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest0_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest0_v],  %[src_hi]  \n\t"
      "psubw      %[dest0_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest0_v],  %[dest0_v],  %[eight]  \n\t"

      // Pixels 4..5 -> dest1_u/dest1_v.
      "gsldrc1    %[src0],  0x0c(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x13(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x0c(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x13(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[dest1_u],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest1_u],  %[dest1_u],  %[value]  \n\t"
      "pinsrh_3   %[dest1_v],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[dest1_u],  %[dest1_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest1_v],  %[dest1_v],  %[mask_v]  \n\t"

      // Pixels 6..7: fold into dest1_u/dest1_v.
      "gsldrc1    %[src0],  0x12(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x19(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x12(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x19(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[src_lo],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pinsrh_3   %[src_hi],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest1_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest1_u],  %[src_lo]  \n\t"
      "psubw      %[dest1_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest1_u],  %[dest1_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest1_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest1_v],  %[src_hi]  \n\t"
      "psubw      %[dest1_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest1_v],  %[dest1_v],  %[eight]  \n\t"

      // Pixels 8..9 -> dest2_u/dest2_v.
      "gsldrc1    %[src0],  0x18(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x1f(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x18(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x1f(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[dest2_u],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest2_u],  %[dest2_u],  %[value]  \n\t"
      "pinsrh_3   %[dest2_v],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[dest2_u],  %[dest2_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest2_v],  %[dest2_v],  %[mask_v]  \n\t"

      // Pixels 10..11: fold into dest2_u/dest2_v.
      "gsldrc1    %[src0],  0x1e(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x25(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x1e(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x25(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[src_lo],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pinsrh_3   %[src_hi],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest2_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest2_u],  %[src_lo]  \n\t"
      "psubw      %[dest2_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest2_u],  %[dest2_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest2_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest2_v],  %[src_hi]  \n\t"
      "psubw      %[dest2_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest2_v],  %[dest2_v],  %[eight]  \n\t"

      // Pixels 12..13 -> dest3_u/dest3_v.
      "gsldrc1    %[src0],  0x24(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x2b(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x24(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x2b(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[dest3_u],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest3_u],  %[dest3_u],  %[value]  \n\t"
      "pinsrh_3   %[dest3_v],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[dest3_u],  %[dest3_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest3_v],  %[dest3_v],  %[mask_v]  \n\t"

      // Pixels 14..15: fold into dest3_u/dest3_v.
      "gsldrc1    %[src0],  0x2a(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x31(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x2a(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x31(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "dsll       %[src_lo],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pinsrh_3   %[src_hi],  %[src0],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest3_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest3_u],  %[src_lo]  \n\t"
      "psubw      %[dest3_u],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest3_u],  %[dest3_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest3_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest3_v],  %[src_hi]  \n\t"
      "psubw      %[dest3_v],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest3_v],  %[dest3_v],  %[eight]  \n\t"

      // Pack U and V to bytes, store 8 of each, advance, loop.
      "packsswh   %[src0],  %[dest0_u],  %[dest1_u]  \n\t"
      "packsswh   %[src1],  %[dest2_u],  %[dest3_u]  \n\t"
      "packushb   %[dest0_u],  %[src0],  %[src1]  \n\t"
      "gssdlc1    %[dest0_u],  0x07(%[dst_u])  \n\t"
      "gssdrc1    %[dest0_u],  0x00(%[dst_u])  \n\t"
      "packsswh   %[src0],  %[dest0_v],  %[dest1_v]  \n\t"
      "packsswh   %[src1],  %[dest2_v],  %[dest3_v]  \n\t"
      "packushb   %[dest0_v],  %[src0],  %[src1]  \n\t"
      "gssdlc1    %[dest0_v],  0x07(%[dst_v])  \n\t"
      "gssdrc1    %[dest0_v],  0x00(%[dst_v])  \n\t"
      "daddiu     %[src_rgb0],  %[src_rgb0],  0x30  \n\t"
      "daddiu     %[dst_u],  %[dst_u],  0x08  \n\t"
      "daddiu     %[dst_v],  %[dst_v],  0x08  \n\t"
      "daddi      %[width],  %[width],  -0x10  \n\t"
      "bgtz       %[width],  1b  \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

// RAW (byte-reversed RGB24) -> luma (Y) row conversion, MMI.  Identical
// structure to RGB24ToYRow_MMI; only the coefficient vector differs
// (mask = 0x0001001900810042 — the 0x42/0x19 channel weights are swapped
// relative to RGB24, matching the reversed channel order).
// NOTE(review): same input-only-operand concern as the other kernels here.
void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;   // (16 << 8) + 128: luma offset + rounding
  const uint64_t mask = 0x0001001900810042;

  __asm__ volatile(
      "1:  \n\t"
      // Pixels 0..1 -> dest0.
      "gsldlc1    %[src],  0x07(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x00(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest0],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest0],  %[dest0],  %[src]  \n\t"
      "psrlw      %[dest0],  %[dest0],  %[eight]  \n\t"

      // Pixels 2..3 -> dest1.
      "gsldlc1    %[src],  0x0d(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x06(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest1],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest1],  %[dest1],  %[src]  \n\t"
      "psrlw      %[dest1],  %[dest1],  %[eight]  \n\t"

      // Pixels 4..5 -> dest2.
      "gsldlc1    %[src],  0x13(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x0c(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest2],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest2],  %[dest2],  %[src]  \n\t"
      "psrlw      %[dest2],  %[dest2],  %[eight]  \n\t"

      // Pixels 6..7 -> dest3.
      "gsldlc1    %[src],  0x19(%[src_argb0])  \n\t"
      "gsldrc1    %[src],  0x12(%[src_argb0])  \n\t"
      "punpcklbh  %[src_lo],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_lo],  %[src_lo],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask]  \n\t"
      "dsll       %[src],  %[src],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src],  %[zero]  \n\t"
      "pinsrh_3   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask]  \n\t"
      "punpcklwd  %[src],  %[src_lo],  %[src_hi]  \n\t"
      "punpckhwd  %[dest3],  %[src_lo],  %[src_hi]  \n\t"
      "paddw      %[dest3],  %[dest3],  %[src]  \n\t"
      "psrlw      %[dest3],  %[dest3],  %[eight]  \n\t"

      // Pack to bytes, store 8 Y values, advance, loop.
      "packsswh   %[src_lo],  %[dest0],  %[dest1]  \n\t"
      "packsswh   %[src_hi],  %[dest2],  %[dest3]  \n\t"
      "packushb   %[dest0],  %[src_lo],  %[src_hi]  \n\t"
      "gssdlc1    %[dest0],  0x07(%[dst_y])  \n\t"
      "gssdrc1    %[dest0],  0x00(%[dst_y])  \n\t"
      "daddiu     %[src_argb0],  %[src_argb0],  0x18  \n\t"
      "daddiu     %[dst_y],  %[dst_y],  0x08  \n\t"
      "daddi      %[width],  %[width],  -0x08  \n\t"
      "bnez       %[width],  1b  \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

// RAW -> chroma (U/V) row conversion, MMI.  Same structure as
// RGB24ToUVRow_MMI but with channel-order-swapped coefficient vectors
// (mask_u = 0x00020070004a0026, mask_v = 0x0012005e00700002), mirrored
// bias placement (pinsrh_3 for U, dsll-16 + pinsrh_0 for V) and mirrored
// psubw operand order in the fold step.
// NOTE(review): same input-only "r" operand concern as the other kernels.
void RAWToUVRow_MMI(const uint8_t* src_rgb0,
                    int src_stride_rgb,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x00020070004a0026;
  const uint64_t mask_v = 0x0012005e00700002;

  __asm__ volatile(
      "1:  \n\t"
      "daddu      %[src_rgb1],  %[src_rgb0],  %[src_stride_rgb]  \n\t"

      // Pixels 0..1 (both rows): 2x2 box average, U/V pmaddhw.
      "gsldrc1    %[src0],  0x00(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x07(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x00(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x07(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[dest0_u],  %[src0],  %[value]  \n\t"
      "dsll       %[dest0_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest0_v],  %[dest0_v],  %[value]  \n\t"
      "pmaddhw    %[dest0_u],  %[dest0_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest0_v],  %[dest0_v],  %[mask_v]  \n\t"

      // Pixels 2..3: fold into dest0_u/dest0_v and >>8.
      "gsldrc1    %[src0],  0x06(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x0d(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x06(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x0d(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsll       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest0_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest0_u],  %[src_lo]  \n\t"
      "psubw      %[dest0_u],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest0_u],  %[dest0_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest0_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest0_v],  %[src_hi]  \n\t"
      "psubw      %[dest0_v],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest0_v],  %[dest0_v],  %[eight]  \n\t"

      // Pixels 4..5 -> dest1_u/dest1_v.
      "gsldrc1    %[src0],  0x0c(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x13(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x0c(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x13(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[dest1_u],  %[src0],  %[value]  \n\t"
      "dsll       %[dest1_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest1_v],  %[dest1_v],  %[value]  \n\t"
      "pmaddhw    %[dest1_u],  %[dest1_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest1_v],  %[dest1_v],  %[mask_v]  \n\t"

      // Pixels 6..7: fold into dest1_u/dest1_v.
      "gsldrc1    %[src0],  0x12(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x19(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x12(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x19(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsll       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest1_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest1_u],  %[src_lo]  \n\t"
      "psubw      %[dest1_u],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest1_u],  %[dest1_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest1_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest1_v],  %[src_hi]  \n\t"
      "psubw      %[dest1_v],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest1_v],  %[dest1_v],  %[eight]  \n\t"

      // Pixels 8..9 -> dest2_u/dest2_v.
      "gsldrc1    %[src0],  0x18(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x1f(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x18(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x1f(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[dest2_u],  %[src0],  %[value]  \n\t"
      "dsll       %[dest2_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest2_v],  %[dest2_v],  %[value]  \n\t"
      "pmaddhw    %[dest2_u],  %[dest2_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest2_v],  %[dest2_v],  %[mask_v]  \n\t"

      // Pixels 10..11: fold into dest2_u/dest2_v.
      "gsldrc1    %[src0],  0x1e(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x25(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x1e(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x25(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsll       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest2_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest2_u],  %[src_lo]  \n\t"
      "psubw      %[dest2_u],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest2_u],  %[dest2_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest2_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest2_v],  %[src_hi]  \n\t"
      "psubw      %[dest2_v],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest2_v],  %[dest2_v],  %[eight]  \n\t"

      // Pixels 12..13 -> dest3_u/dest3_v.
      "gsldrc1    %[src0],  0x24(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x2b(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x24(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x2b(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[dest3_u],  %[src0],  %[value]  \n\t"
      "dsll       %[dest3_v],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[dest3_v],  %[dest3_v],  %[value]  \n\t"
      "pmaddhw    %[dest3_u],  %[dest3_u],  %[mask_u]  \n\t"
      "pmaddhw    %[dest3_v],  %[dest3_v],  %[mask_v]  \n\t"

      // Pixels 14..15: fold into dest3_u/dest3_v.
      "gsldrc1    %[src0],  0x2a(%[src_rgb0])  \n\t"
      "gsldlc1    %[src0],  0x31(%[src_rgb0])  \n\t"
      "gsldrc1    %[src1],  0x2a(%[src_rgb1])  \n\t"
      "gsldlc1    %[src1],  0x31(%[src_rgb1])  \n\t"
      "punpcklbh  %[src_lo],  %[src0],  %[zero]  \n\t"
      "dsll       %[src0],  %[src0],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src0],  %[zero]  \n\t"
      "paddh      %[src0],  %[src_lo],  %[src_hi]  \n\t"
      "punpcklbh  %[src_lo],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_lo]  \n\t"
      "dsll       %[src1],  %[src1],  %[eight]  \n\t"
      "punpckhbh  %[src_hi],  %[src1],  %[zero]  \n\t"
      "paddh      %[src0],  %[src0],  %[src_hi]  \n\t"
      "psrlh      %[src0],  %[src0],  %[two]  \n\t"
      "pinsrh_3   %[src_lo],  %[src0],  %[value]  \n\t"
      "dsll       %[src_hi],  %[src0],  %[sixteen]  \n\t"
      "pinsrh_0   %[src_hi],  %[src_hi],  %[value]  \n\t"
      "pmaddhw    %[src_lo],  %[src_lo],  %[mask_u]  \n\t"
      "pmaddhw    %[src_hi],  %[src_hi],  %[mask_v]  \n\t"
      "punpcklwd  %[src0],  %[dest3_u],  %[src_lo]  \n\t"
      "punpckhwd  %[src1],  %[dest3_u],  %[src_lo]  \n\t"
      "psubw      %[dest3_u],  %[src1],  %[src0]  \n\t"
      "psraw      %[dest3_u],  %[dest3_u],  %[eight]  \n\t"
      "punpcklwd  %[src0],  %[dest3_v],  %[src_hi]  \n\t"
      "punpckhwd  %[src1],  %[dest3_v],  %[src_hi]  \n\t"
      "psubw      %[dest3_v],  %[src0],  %[src1]  \n\t"
      "psraw      %[dest3_v],  %[dest3_v],  %[eight]  \n\t"

      // Pack and store 8 U bytes.  (The V pack/store, pointer advance and
      // loop branch continue on the following original lines.)
      "packsswh   %[src0],  %[dest0_u],  %[dest1_u]  \n\t"
      "packsswh   %[src1],  %[dest2_u],  %[dest3_u]  \n\t"
      "packushb   %[dest0_u],  %[src0],  %[src1]  \n\t"
      "gssdlc1    %[dest0_u],  0x07(%[dst_u])  \n\t"
      "gssdrc1    %[dest0_u],  0x00(%[dst_u])  \n\t"
// Convert 8 ARGB pixels per iteration to full-range (JPEG) luma.
// From the visible constants: mask1 halfwords are {15, 75, 38, 1} and the
// alpha byte is replaced by value=0x40, so each pixel computes
//   Y = (15*B + 75*G + 38*R + 64) >> 7
// which matches the full-range BT.601 luma coefficients (sum = 128).
// NOTE(review): width is assumed to be a positive multiple of 8 (the loop
// uses bnez on width -= 8 and stores 8 bytes per pass) — confirm at callers.
void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest, dest0, dest1, dest2, dest3;
  uint64_t tmp0, tmp1;
  const uint64_t shift = 0x07;
  const uint64_t value = 0x0040;  // rounding bias, paired with mask1's +1
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x00010026004B000FULL;  // {1, 38, 75, 15} hi..lo

  __asm__ volatile(
      "1:                                                      \n\t"
      // Pixels 0-1: widen to halfwords, splice in the bias, dot-product.
      "gsldlc1   %[src],     0x07(%[src_ptr])                  \n\t"
      "gsldrc1   %[src],     0x00(%[src_ptr])                  \n\t"
      "punpcklbh %[src_lo],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask1]             \n\t"
      "punpckhbh %[src_hi],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_hi],  %[src_hi],   %[value]             \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask1]             \n\t"
      "punpcklwd %[tmp0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[tmp1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest0],   %[tmp0],     %[tmp1]              \n\t"
      "psrlw     %[dest0],   %[dest0],    %[shift]             \n\t"

      // Pixels 2-3.
      "gsldlc1   %[src],     0x0f(%[src_ptr])                  \n\t"
      "gsldrc1   %[src],     0x08(%[src_ptr])                  \n\t"
      "punpcklbh %[src_lo],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask1]             \n\t"
      "punpckhbh %[src_hi],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_hi],  %[src_hi],   %[value]             \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask1]             \n\t"
      "punpcklwd %[tmp0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[tmp1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest1],   %[tmp0],     %[tmp1]              \n\t"
      "psrlw     %[dest1],   %[dest1],    %[shift]             \n\t"

      // Pixels 4-5.
      "gsldlc1   %[src],     0x17(%[src_ptr])                  \n\t"
      "gsldrc1   %[src],     0x10(%[src_ptr])                  \n\t"
      "punpcklbh %[src_lo],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask1]             \n\t"
      "punpckhbh %[src_hi],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_hi],  %[src_hi],   %[value]             \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask1]             \n\t"
      "punpcklwd %[tmp0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[tmp1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest2],   %[tmp0],     %[tmp1]              \n\t"
      "psrlw     %[dest2],   %[dest2],    %[shift]             \n\t"

      // Pixels 6-7.
      "gsldlc1   %[src],     0x1f(%[src_ptr])                  \n\t"
      "gsldrc1   %[src],     0x18(%[src_ptr])                  \n\t"
      "punpcklbh %[src_lo],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask1]             \n\t"
      "punpckhbh %[src_hi],  %[src],      %[mask0]             \n\t"
      "pinsrh_3  %[src_hi],  %[src_hi],   %[value]             \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask1]             \n\t"
      "punpcklwd %[tmp0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[tmp1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest3],   %[tmp0],     %[tmp1]              \n\t"
      "psrlw     %[dest3],   %[dest3],    %[shift]             \n\t"

      // Narrow the four word pairs to 8 unsigned bytes and store.
      "packsswh  %[tmp0],    %[dest0],    %[dest1]             \n\t"
      "packsswh  %[tmp1],    %[dest2],    %[dest3]             \n\t"
      "packushb  %[dest],    %[tmp0],     %[tmp1]              \n\t"
      "gssdlc1   %[dest],    0x07(%[dst_ptr])                  \n\t"
      "gssdrc1   %[dest],    0x00(%[dst_ptr])                  \n\t"

      "daddiu    %[src_ptr], %[src_ptr],  0x20                 \n\t"
      "daddiu    %[dst_ptr], %[dst_ptr],  0x08                 \n\t"
      "daddi     %[width],   %[width],    -0x08                \n\t"
      "bnez      %[width],   1b                                \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
        [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
        [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
        [tmp1] "=&f"(tmp1)
      : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
        [width] "r"(width)
      : "memory");
}
// Convert 16 ARGB pixels (two rows, 2x2 subsampled) per iteration to
// full-range (JPEG) chroma. Each 2x2 block is averaged with pavgh, then:
// from the visible constants (mask_u halfwords {2,127,84,43}, mask_v
// halfwords {20,107,127,2}, value = 0x4040) the word-interleave/psubw
// trick computes
//   U = (0x8080 + 127*B - 84*G - 43*R) >> 8
//   V = (0x8080 - 20*B - 107*G + 127*R) >> 8
// NOTE(review): width is assumed positive and a multiple of 16 — confirm.
void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
                      int src_stride_rgb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x002b0054007f0002;
  const uint64_t mask_v = 0x0002007f006b0014;

  __asm__ volatile(
      "1:                                                      \n\t"
      "daddu     %[src_rgb1], %[src_rgb0], %[src_stride_rgb]   \n\t"

      // Pixels 0-1 (both rows): average, bias, dot-product into dest0_u/v.
      "gsldrc1   %[src0],    0x00(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x07(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x00(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x07(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[dest0_u], %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[dest0_u], %[dest0_u],  %[value]             \n\t"
      "pinsrh_3  %[dest0_v], %[src0],     %[value]             \n\t"
      "pmaddhw   %[dest0_u], %[dest0_u],  %[mask_u]            \n\t"
      "pmaddhw   %[dest0_v], %[dest0_v],  %[mask_v]            \n\t"

      // Pixels 2-3 into src_lo/src_hi.
      "gsldrc1   %[src0],    0x08(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x0f(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x08(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x0f(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[src_lo],  %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pinsrh_3  %[src_hi],  %[src0],     %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask_u]            \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask_v]            \n\t"

      // Combine (positive-coefficient word minus negative-coefficient word).
      "punpcklwd %[src0],    %[dest0_u],  %[src_lo]            \n\t"
      "punpckhwd %[src1],    %[dest0_u],  %[src_lo]            \n\t"
      "psubw     %[dest0_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest0_u], %[dest0_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest0_v],  %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[dest0_v],  %[src_hi]            \n\t"
      "psubw     %[dest0_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest0_v], %[dest0_v],  %[eight]             \n\t"

      // Pixels 4-5.
      "gsldrc1   %[src0],    0x10(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x17(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x10(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x17(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[dest1_u], %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[dest1_u], %[dest1_u],  %[value]             \n\t"
      "pinsrh_3  %[dest1_v], %[src0],     %[value]             \n\t"
      "pmaddhw   %[dest1_u], %[dest1_u],  %[mask_u]            \n\t"
      "pmaddhw   %[dest1_v], %[dest1_v],  %[mask_v]            \n\t"

      // Pixels 6-7.
      "gsldrc1   %[src0],    0x18(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x1f(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x18(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x1f(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[src_lo],  %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pinsrh_3  %[src_hi],  %[src0],     %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask_u]            \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask_v]            \n\t"

      "punpcklwd %[src0],    %[dest1_u],  %[src_lo]            \n\t"
      "punpckhwd %[src1],    %[dest1_u],  %[src_lo]            \n\t"
      "psubw     %[dest1_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest1_u], %[dest1_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest1_v],  %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[dest1_v],  %[src_hi]            \n\t"
      "psubw     %[dest1_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest1_v], %[dest1_v],  %[eight]             \n\t"

      // Pixels 8-9.
      "gsldrc1   %[src0],    0x20(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x27(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x20(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x27(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[dest2_u], %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[dest2_u], %[dest2_u],  %[value]             \n\t"
      "pinsrh_3  %[dest2_v], %[src0],     %[value]             \n\t"
      "pmaddhw   %[dest2_u], %[dest2_u],  %[mask_u]            \n\t"
      "pmaddhw   %[dest2_v], %[dest2_v],  %[mask_v]            \n\t"

      // Pixels 10-11.
      "gsldrc1   %[src0],    0x28(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x2f(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x28(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x2f(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[src_lo],  %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pinsrh_3  %[src_hi],  %[src0],     %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask_u]            \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask_v]            \n\t"

      "punpcklwd %[src0],    %[dest2_u],  %[src_lo]            \n\t"
      "punpckhwd %[src1],    %[dest2_u],  %[src_lo]            \n\t"
      "psubw     %[dest2_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest2_u], %[dest2_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest2_v],  %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[dest2_v],  %[src_hi]            \n\t"
      "psubw     %[dest2_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest2_v], %[dest2_v],  %[eight]             \n\t"

      // Pixels 12-13.
      "gsldrc1   %[src0],    0x30(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x37(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x30(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x37(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[dest3_u], %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[dest3_u], %[dest3_u],  %[value]             \n\t"
      "pinsrh_3  %[dest3_v], %[src0],     %[value]             \n\t"
      "pmaddhw   %[dest3_u], %[dest3_u],  %[mask_u]            \n\t"
      "pmaddhw   %[dest3_v], %[dest3_v],  %[mask_v]            \n\t"

      // Pixels 14-15.
      "gsldrc1   %[src0],    0x38(%[src_rgb0])                 \n\t"
      "gsldlc1   %[src0],    0x3f(%[src_rgb0])                 \n\t"
      "gsldrc1   %[src1],    0x38(%[src_rgb1])                 \n\t"
      "gsldlc1   %[src1],    0x3f(%[src_rgb1])                 \n\t"
      "punpcklbh %[src_lo],  %[src0],     %[zero]              \n\t"
      "punpckhbh %[src_hi],  %[src0],     %[zero]              \n\t"
      "punpcklbh %[src0],    %[src1],     %[zero]              \n\t"
      "punpckhbh %[src1],    %[src1],     %[zero]              \n\t"
      "pavgh     %[src0],    %[src_lo],   %[src0]              \n\t"
      "pavgh     %[src1],    %[src_hi],   %[src1]              \n\t"
      "pavgh     %[src0],    %[src0],     %[src1]              \n\t"
      "dsll      %[src_lo],  %[src0],     %[sixteen]           \n\t"
      "pinsrh_0  %[src_lo],  %[src_lo],   %[value]             \n\t"
      "pinsrh_3  %[src_hi],  %[src0],     %[value]             \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask_u]            \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask_v]            \n\t"

      "punpcklwd %[src0],    %[dest3_u],  %[src_lo]            \n\t"
      "punpckhwd %[src1],    %[dest3_u],  %[src_lo]            \n\t"
      "psubw     %[dest3_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest3_u], %[dest3_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest3_v],  %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[dest3_v],  %[src_hi]            \n\t"
      "psubw     %[dest3_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest3_v], %[dest3_v],  %[eight]             \n\t"

      // Pack 8 U and 8 V bytes and store.
      "packsswh  %[src0],    %[dest0_u],  %[dest1_u]           \n\t"
      "packsswh  %[src1],    %[dest2_u],  %[dest3_u]           \n\t"
      "packushb  %[dest0_u], %[src0],     %[src1]              \n\t"
      "gssdlc1   %[dest0_u], 0x07(%[dst_u])                    \n\t"
      "gssdrc1   %[dest0_u], 0x00(%[dst_u])                    \n\t"

      "packsswh  %[src0],    %[dest0_v],  %[dest1_v]           \n\t"
      "packsswh  %[src1],    %[dest2_v],  %[dest3_v]           \n\t"
      "packushb  %[dest0_v], %[src0],     %[src1]              \n\t"
      "gssdlc1   %[dest0_v], 0x07(%[dst_v])                    \n\t"
      "gssdrc1   %[dest0_v], 0x00(%[dst_v])                    \n\t"

      "daddiu    %[src_rgb0], %[src_rgb0], 0x40                \n\t"
      "daddiu    %[dst_u],   %[dst_u],    0x08                 \n\t"
      "daddiu    %[dst_v],   %[dst_v],    0x08                 \n\t"
      "daddi     %[width],   %[width],    -0x10                \n\t"
      "bgtz      %[width],   1b                                \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}
// Convert 8 RGB565 pixels per iteration to studio-range luma.
// Channels are unpacked (B=5, G=6, R=5 bits), expanded to 8 bits by bit
// replication ((b<<3)|(b>>2), (g<<2)|(g>>4), (r<<3)|(r>>2)), then from the
// visible constants (mask halfwords {25, 129, 66, 1}, value halfword 0x1080)
//   Y = (25*B + 129*G + 66*R + 0x1080) >> 8
// NOTE(review): width is assumed positive and a multiple of 8 — confirm.
void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  uint64_t ftmp[11];
  const uint64_t value = 0x1080108010801080;
  const uint64_t mask = 0x0001004200810019;
  uint64_t c0 = 0x001f001f001f001f;  // low 5 bits
  uint64_t c1 = 0x00ff00ff00ff00ff;  // low byte
  uint64_t c2 = 0x0007000700070007;  // low 3 bits of high byte
  __asm__ volatile(
      "1:                                                      \n\t"
      // Pixels 0-3: split into b/g/r halfword lanes and expand to 8 bits.
      "gsldrc1   %[src0],    0x00(%[src_rgb565])               \n\t"
      "gsldlc1   %[src0],    0x07(%[src_rgb565])               \n\t"
      "psrlh     %[src1],    %[src0],     %[eight]             \n\t"
      "and       %[b],       %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g],       %[src1],     %[c2]                \n\t"
      "psllh     %[g],       %[g],        %[three]             \n\t"
      "or        %[g],       %[src0],     %[g]                 \n\t"
      "psrlh     %[r],       %[src1],     %[three]             \n\t"
      "psllh     %[src0],    %[b],        %[three]             \n\t"
      "psrlh     %[src1],    %[b],        %[two]               \n\t"
      "or        %[b],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[g],        %[two]               \n\t"
      "psrlh     %[src1],    %[g],        %[four]              \n\t"
      "or        %[g],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[r],        %[three]             \n\t"
      "psrlh     %[src1],    %[r],        %[two]               \n\t"
      "or        %[r],       %[src0],     %[src1]              \n\t"
      // Weighted sum for pixels 0-1.
      "punpcklhw %[src0],    %[b],        %[r]                 \n\t"
      "punpcklhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest0],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest0],   %[dest0],    %[eight]             \n\t"
      // Weighted sum for pixels 2-3.
      "punpckhhw %[src0],    %[b],        %[r]                 \n\t"
      "punpckhhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest1],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest1],   %[dest1],    %[eight]             \n\t"

      // Pixels 4-7.
      "gsldrc1   %[src0],    0x08(%[src_rgb565])               \n\t"
      "gsldlc1   %[src0],    0x0f(%[src_rgb565])               \n\t"
      "psrlh     %[src1],    %[src0],     %[eight]             \n\t"
      "and       %[b],       %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g],       %[src1],     %[c2]                \n\t"
      "psllh     %[g],       %[g],        %[three]             \n\t"
      "or        %[g],       %[src0],     %[g]                 \n\t"
      "psrlh     %[r],       %[src1],     %[three]             \n\t"
      "psllh     %[src0],    %[b],        %[three]             \n\t"
      "psrlh     %[src1],    %[b],        %[two]               \n\t"
      "or        %[b],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[g],        %[two]               \n\t"
      "psrlh     %[src1],    %[g],        %[four]              \n\t"
      "or        %[g],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[r],        %[three]             \n\t"
      "psrlh     %[src1],    %[r],        %[two]               \n\t"
      "or        %[r],       %[src0],     %[src1]              \n\t"
      "punpcklhw %[src0],    %[b],        %[r]                 \n\t"
      "punpcklhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest2],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest2],   %[dest2],    %[eight]             \n\t"

      "punpckhhw %[src0],    %[b],        %[r]                 \n\t"
      "punpckhhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest3],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest3],   %[dest3],    %[eight]             \n\t"

      // Pack to 8 bytes and store.
      "packsswh  %[src_lo],  %[dest0],    %[dest1]             \n\t"
      "packsswh  %[src_hi],  %[dest2],    %[dest3]             \n\t"
      "packushb  %[dest0],   %[src_lo],   %[src_hi]            \n\t"
      "gssdlc1   %[dest0],   0x07(%[dst_y])                    \n\t"
      "gssdrc1   %[dest0],   0x00(%[dst_y])                    \n\t"

      "daddiu    %[src_rgb565], %[src_rgb565], 0x10            \n\t"
      "daddiu    %[dst_y],   %[dst_y],    0x08                 \n\t"
      "daddiu    %[width],   %[width],    -0x08                \n\t"
      "bgtz      %[width],   1b                                \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
        [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
        [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
      : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
        [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
        [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
        [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
      : "memory");
}
// Convert 8 ARGB1555 pixels per iteration to studio-range luma.
// B/G/R are each 5 bits; all three expand via (x<<3)|(x>>2), then the same
// {25, 129, 66, +0x1080} >> 8 weighted sum as the other ToY rows here.
// NOTE(review): the [seven] input operand is unreferenced by the asm body;
// kept to preserve the original operand list byte-for-byte.
// NOTE(review): width is assumed positive and a multiple of 8 — confirm.
void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
                        uint8_t* dst_y,
                        int width) {
  uint64_t ftmp[11];
  const uint64_t value = 0x1080108010801080;
  const uint64_t mask = 0x0001004200810019;
  uint64_t c0 = 0x001f001f001f001f;  // low 5 bits
  uint64_t c1 = 0x00ff00ff00ff00ff;  // low byte
  uint64_t c2 = 0x0003000300030003;  // G high 2 bits
  uint64_t c3 = 0x007c007c007c007c;  // R field in high byte
  __asm__ volatile(
      "1:                                                      \n\t"
      // Pixels 0-3: unpack 5-bit channels and expand to 8 bits.
      "gsldrc1   %[src0],    0x00(%[src_argb1555])             \n\t"
      "gsldlc1   %[src0],    0x07(%[src_argb1555])             \n\t"
      "psrlh     %[src1],    %[src0],     %[eight]             \n\t"
      "and       %[b],       %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g],       %[src1],     %[c2]                \n\t"
      "psllh     %[g],       %[g],        %[three]             \n\t"
      "or        %[g],       %[src0],     %[g]                 \n\t"
      "and       %[r],       %[src1],     %[c3]                \n\t"
      "psrlh     %[r],       %[r],        %[two]               \n\t"
      "psllh     %[src0],    %[b],        %[three]             \n\t"
      "psrlh     %[src1],    %[b],        %[two]               \n\t"
      "or        %[b],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[g],        %[three]             \n\t"
      "psrlh     %[src1],    %[g],        %[two]               \n\t"
      "or        %[g],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[r],        %[three]             \n\t"
      "psrlh     %[src1],    %[r],        %[two]               \n\t"
      "or        %[r],       %[src0],     %[src1]              \n\t"
      // Weighted sum for pixels 0-1.
      "punpcklhw %[src0],    %[b],        %[r]                 \n\t"
      "punpcklhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest0],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest0],   %[dest0],    %[eight]             \n\t"
      // Weighted sum for pixels 2-3.
      "punpckhhw %[src0],    %[b],        %[r]                 \n\t"
      "punpckhhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest1],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest1],   %[dest1],    %[eight]             \n\t"

      // Pixels 4-7.
      "gsldrc1   %[src0],    0x08(%[src_argb1555])             \n\t"
      "gsldlc1   %[src0],    0x0f(%[src_argb1555])             \n\t"
      "psrlh     %[src1],    %[src0],     %[eight]             \n\t"
      "and       %[b],       %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g],       %[src1],     %[c2]                \n\t"
      "psllh     %[g],       %[g],        %[three]             \n\t"
      "or        %[g],       %[src0],     %[g]                 \n\t"
      "and       %[r],       %[src1],     %[c3]                \n\t"
      "psrlh     %[r],       %[r],        %[two]               \n\t"
      "psllh     %[src0],    %[b],        %[three]             \n\t"
      "psrlh     %[src1],    %[b],        %[two]               \n\t"
      "or        %[b],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[g],        %[three]             \n\t"
      "psrlh     %[src1],    %[g],        %[two]               \n\t"
      "or        %[g],       %[src0],     %[src1]              \n\t"
      "psllh     %[src0],    %[r],        %[three]             \n\t"
      "psrlh     %[src1],    %[r],        %[two]               \n\t"
      "or        %[r],       %[src0],     %[src1]              \n\t"
      "punpcklhw %[src0],    %[b],        %[r]                 \n\t"
      "punpcklhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest2],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest2],   %[dest2],    %[eight]             \n\t"

      "punpckhhw %[src0],    %[b],        %[r]                 \n\t"
      "punpckhhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest3],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest3],   %[dest3],    %[eight]             \n\t"

      // Pack to 8 bytes and store.
      "packsswh  %[src_lo],  %[dest0],    %[dest1]             \n\t"
      "packsswh  %[src_hi],  %[dest2],    %[dest3]             \n\t"
      "packushb  %[dest0],   %[src_lo],   %[src_hi]            \n\t"
      "gssdlc1   %[dest0],   0x07(%[dst_y])                    \n\t"
      "gssdrc1   %[dest0],   0x00(%[dst_y])                    \n\t"

      "daddiu    %[src_argb1555], %[src_argb1555], 0x10        \n\t"
      "daddiu    %[dst_y],   %[dst_y],    0x08                 \n\t"
      "daddiu    %[width],   %[width],    -0x08                \n\t"
      "bgtz      %[width],   1b                                \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
        [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
        [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
      : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
        [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
        [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
        [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
      : "memory");
}
// Convert 8 ARGB4444 pixels per iteration to studio-range luma.
// B/G/R are 4 bits each; expansion to 8 bits is nibble duplication
// ((x<<4)|x), then the same {25, 129, 66, +0x1080} >> 8 weighted sum as the
// sibling ToY rows.
// NOTE(review): width is assumed positive and a multiple of 8 — confirm.
void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
                        uint8_t* dst_y,
                        int width) {
  uint64_t ftmp[11];
  uint64_t value = 0x1080108010801080;
  uint64_t mask = 0x0001004200810019;
  uint64_t c0 = 0x000f000f000f000f;  // low nibble
  uint64_t c1 = 0x00ff00ff00ff00ff;  // low byte
  __asm__ volatile(
      "1:                                                      \n\t"
      // Pixels 0-3: unpack 4-bit channels and duplicate nibbles.
      "gsldrc1   %[src0],    0x00(%[src_argb4444])             \n\t"
      "gsldlc1   %[src0],    0x07(%[src_argb4444])             \n\t"
      "psrlh     %[src1],    %[src0],     %[eight]             \n\t"
      "and       %[b],       %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[g],       %[src0],     %[four]              \n\t"
      "and       %[r],       %[src1],     %[c0]                \n\t"
      "psllh     %[src0],    %[b],        %[four]              \n\t"
      "or        %[b],       %[src0],     %[b]                 \n\t"
      "psllh     %[src0],    %[g],        %[four]              \n\t"
      "or        %[g],       %[src0],     %[g]                 \n\t"
      "psllh     %[src0],    %[r],        %[four]              \n\t"
      "or        %[r],       %[src0],     %[r]                 \n\t"
      // Weighted sum for pixels 0-1.
      "punpcklhw %[src0],    %[b],        %[r]                 \n\t"
      "punpcklhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest0],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest0],   %[dest0],    %[eight]             \n\t"
      // Weighted sum for pixels 2-3.
      "punpckhhw %[src0],    %[b],        %[r]                 \n\t"
      "punpckhhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest1],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest1],   %[dest1],    %[eight]             \n\t"

      // Pixels 4-7.
      "gsldrc1   %[src0],    0x08(%[src_argb4444])             \n\t"
      "gsldlc1   %[src0],    0x0f(%[src_argb4444])             \n\t"
      "psrlh     %[src1],    %[src0],     %[eight]             \n\t"
      "and       %[b],       %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[g],       %[src0],     %[four]              \n\t"
      "and       %[r],       %[src1],     %[c0]                \n\t"
      "psllh     %[src0],    %[b],        %[four]              \n\t"
      "or        %[b],       %[src0],     %[b]                 \n\t"
      "psllh     %[src0],    %[g],        %[four]              \n\t"
      "or        %[g],       %[src0],     %[g]                 \n\t"
      "psllh     %[src0],    %[r],        %[four]              \n\t"
      "or        %[r],       %[src0],     %[r]                 \n\t"
      "punpcklhw %[src0],    %[b],        %[r]                 \n\t"
      "punpcklhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest2],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest2],   %[dest2],    %[eight]             \n\t"

      "punpckhhw %[src0],    %[b],        %[r]                 \n\t"
      "punpckhhw %[src1],    %[g],        %[value]             \n\t"
      "punpcklhw %[src_lo],  %[src0],     %[src1]              \n\t"
      "punpckhhw %[src_hi],  %[src0],     %[src1]              \n\t"
      "pmaddhw   %[src_lo],  %[src_lo],   %[mask]              \n\t"
      "pmaddhw   %[src_hi],  %[src_hi],   %[mask]              \n\t"
      "punpcklwd %[src0],    %[src_lo],   %[src_hi]            \n\t"
      "punpckhwd %[src1],    %[src_lo],   %[src_hi]            \n\t"
      "paddw     %[dest3],   %[src0],     %[src1]              \n\t"
      "psrlw     %[dest3],   %[dest3],    %[eight]             \n\t"

      // Pack to 8 bytes and store.
      "packsswh  %[src_lo],  %[dest0],    %[dest1]             \n\t"
      "packsswh  %[src_hi],  %[dest2],    %[dest3]             \n\t"
      "packushb  %[dest0],   %[src_lo],   %[src_hi]            \n\t"
      "gssdlc1   %[dest0],   0x07(%[dst_y])                    \n\t"
      "gssdrc1   %[dest0],   0x00(%[dst_y])                    \n\t"

      "daddiu    %[src_argb4444], %[src_argb4444], 0x10        \n\t"
      "daddiu    %[dst_y],   %[dst_y],    0x08                 \n\t"
      "daddiu    %[width],   %[width],    -0x08                \n\t"
      "bgtz      %[width],   1b                                \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
        [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
        [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
      : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
        [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
        [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
      : "memory");
}
// Convert 16 RGB565 pixels (two rows, 2x2 subsampled) per iteration to
// studio-range chroma. Each 8-pixel chunk unpacks/expands b/g/r for both
// rows, sums the 2x2 neighbors, then applies the mask_u/mask_v dot products
// with bias value=0x2020 (sums of 4 samples, so the net bias is 128 after
// the >> 8).
// NOTE(review): the [next_rgb565] operand is declared input-only ("r") but
// is written by the daddu/daddiu instructions — as are src_rgb565, dst_u,
// dst_v and width. This mirrors the original operand lists; strictly these
// should be "+r" — confirm against upstream before changing.
// NOTE(review): width is assumed positive and a multiple of 16 — confirm.
void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
                       int src_stride_rgb565,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  uint64_t ftmp[13];
  uint64_t value = 0x2020202020202020;
  uint64_t mask_u = 0x0026004a00700002;
  uint64_t mask_v = 0x00020070005e0012;
  uint64_t mask = 0x93;  // pshufh selector
  uint64_t c0 = 0x001f001f001f001f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t c2 = 0x0007000700070007;
  __asm__ volatile(
      "daddu     %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
      "1:                                                      \n\t"
      // Chunk 0 (pixels 0-3, both rows): unpack 5/6/5 fields.
      "gsldrc1   %[src0],    0x00(%[src_rgb565])               \n\t"
      "gsldlc1   %[src0],    0x07(%[src_rgb565])               \n\t"
      "gsldrc1   %[src1],    0x00(%[next_rgb565])              \n\t"
      "gsldlc1   %[src1],    0x07(%[next_rgb565])              \n\t"
      "psrlh     %[dest0_u], %[src0],     %[eight]             \n\t"
      "and       %[b0],      %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g0],      %[dest0_u],  %[c2]                \n\t"
      "psllh     %[g0],      %[g0],       %[three]             \n\t"
      "or        %[g0],      %[src0],     %[g0]                \n\t"
      "psrlh     %[r0],      %[dest0_u],  %[three]             \n\t"
      "psrlh     %[src0],    %[src1],     %[eight]             \n\t"
      "and       %[dest0_u], %[src1],     %[c0]                \n\t"
      "and       %[src1],    %[src1],     %[c1]                \n\t"
      "psrlh     %[src1],    %[src1],     %[five]              \n\t"
      "and       %[dest0_v], %[src0],     %[c2]                \n\t"
      "psllh     %[dest0_v], %[dest0_v],  %[three]             \n\t"
      "or        %[dest0_v], %[src1],     %[dest0_v]           \n\t"
      "psrlh     %[src0],    %[src0],     %[three]             \n\t"
      // Vertical sums, then horizontal 2x2 reduction.
      "paddh     %[b0],      %[b0],       %[dest0_u]           \n\t"
      "paddh     %[g0],      %[g0],       %[dest0_v]           \n\t"
      "paddh     %[r0],      %[r0],       %[src0]              \n\t"
      "punpcklhw %[src0],    %[b0],       %[r0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[r0]                \n\t"
      "punpcklwd %[dest0_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest0_v], %[src0],     %[src1]              \n\t"
      "paddh     %[src0],    %[dest0_u],  %[dest0_v]           \n\t"
      "psrlh     %[b0],      %[src0],     %[six]               \n\t"
      "psllh     %[r0],      %[src0],     %[one]               \n\t"
      "or        %[b0],      %[b0],       %[r0]                \n\t"
      "punpcklhw %[src0],    %[g0],       %[value]             \n\t"
      "punpckhhw %[src1],    %[g0],       %[value]             \n\t"
      "punpcklwd %[dest0_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest0_v], %[src0],     %[src1]              \n\t"
      "paddh     %[g0],      %[dest0_u],  %[dest0_v]           \n\t"
      "punpcklhw %[src0],    %[b0],       %[g0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[g0]                \n\t"

      "pmaddhw   %[dest0_v], %[src0],     %[mask_v]            \n\t"
      "pshufh    %[dest0_u], %[src0],     %[mask]              \n\t"
      "pmaddhw   %[dest0_u], %[dest0_u],  %[mask_u]            \n\t"
      "pmaddhw   %[g0],      %[src1],     %[mask_v]            \n\t"
      "pshufh    %[b0],      %[src1],     %[mask]              \n\t"
      "pmaddhw   %[b0],      %[b0],       %[mask_u]            \n\t"

      "punpcklwd %[src0],    %[dest0_u],  %[b0]                \n\t"
      "punpckhwd %[src1],    %[dest0_u],  %[b0]                \n\t"
      "psubw     %[dest0_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest0_u], %[dest0_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest0_v],  %[g0]                \n\t"
      "punpckhwd %[src1],    %[dest0_v],  %[g0]                \n\t"
      "psubw     %[dest0_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest0_v], %[dest0_v],  %[eight]             \n\t"

      // Chunk 1 (pixels 4-7).
      "gsldrc1   %[src0],    0x08(%[src_rgb565])               \n\t"
      "gsldlc1   %[src0],    0x0f(%[src_rgb565])               \n\t"
      "gsldrc1   %[src1],    0x08(%[next_rgb565])              \n\t"
      "gsldlc1   %[src1],    0x0f(%[next_rgb565])              \n\t"
      "psrlh     %[dest1_u], %[src0],     %[eight]             \n\t"
      "and       %[b0],      %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g0],      %[dest1_u],  %[c2]                \n\t"
      "psllh     %[g0],      %[g0],       %[three]             \n\t"
      "or        %[g0],      %[src0],     %[g0]                \n\t"
      "psrlh     %[r0],      %[dest1_u],  %[three]             \n\t"
      "psrlh     %[src0],    %[src1],     %[eight]             \n\t"
      "and       %[dest1_u], %[src1],     %[c0]                \n\t"
      "and       %[src1],    %[src1],     %[c1]                \n\t"
      "psrlh     %[src1],    %[src1],     %[five]              \n\t"
      "and       %[dest1_v], %[src0],     %[c2]                \n\t"
      "psllh     %[dest1_v], %[dest1_v],  %[three]             \n\t"
      "or        %[dest1_v], %[src1],     %[dest1_v]           \n\t"
      "psrlh     %[src0],    %[src0],     %[three]             \n\t"
      "paddh     %[b0],      %[b0],       %[dest1_u]           \n\t"
      "paddh     %[g0],      %[g0],       %[dest1_v]           \n\t"
      "paddh     %[r0],      %[r0],       %[src0]              \n\t"
      "punpcklhw %[src0],    %[b0],       %[r0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[r0]                \n\t"
      "punpcklwd %[dest1_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest1_v], %[src0],     %[src1]              \n\t"
      "paddh     %[src0],    %[dest1_u],  %[dest1_v]           \n\t"
      "psrlh     %[b0],      %[src0],     %[six]               \n\t"
      "psllh     %[r0],      %[src0],     %[one]               \n\t"
      "or        %[b0],      %[b0],       %[r0]                \n\t"
      "punpcklhw %[src0],    %[g0],       %[value]             \n\t"
      "punpckhhw %[src1],    %[g0],       %[value]             \n\t"
      "punpcklwd %[dest1_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest1_v], %[src0],     %[src1]              \n\t"
      "paddh     %[g0],      %[dest1_u],  %[dest1_v]           \n\t"
      "punpcklhw %[src0],    %[b0],       %[g0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[g0]                \n\t"

      "pmaddhw   %[dest1_v], %[src0],     %[mask_v]            \n\t"
      "pshufh    %[dest1_u], %[src0],     %[mask]              \n\t"
      "pmaddhw   %[dest1_u], %[dest1_u],  %[mask_u]            \n\t"
      "pmaddhw   %[g0],      %[src1],     %[mask_v]            \n\t"
      "pshufh    %[b0],      %[src1],     %[mask]              \n\t"
      "pmaddhw   %[b0],      %[b0],       %[mask_u]            \n\t"

      "punpcklwd %[src0],    %[dest1_u],  %[b0]                \n\t"
      "punpckhwd %[src1],    %[dest1_u],  %[b0]                \n\t"
      "psubw     %[dest1_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest1_u], %[dest1_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest1_v],  %[g0]                \n\t"
      "punpckhwd %[src1],    %[dest1_v],  %[g0]                \n\t"
      "psubw     %[dest1_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest1_v], %[dest1_v],  %[eight]             \n\t"

      // Chunk 2 (pixels 8-11).
      "gsldrc1   %[src0],    0x10(%[src_rgb565])               \n\t"
      "gsldlc1   %[src0],    0x17(%[src_rgb565])               \n\t"
      "gsldrc1   %[src1],    0x10(%[next_rgb565])              \n\t"
      "gsldlc1   %[src1],    0x17(%[next_rgb565])              \n\t"
      "psrlh     %[dest2_u], %[src0],     %[eight]             \n\t"
      "and       %[b0],      %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g0],      %[dest2_u],  %[c2]                \n\t"
      "psllh     %[g0],      %[g0],       %[three]             \n\t"
      "or        %[g0],      %[src0],     %[g0]                \n\t"
      "psrlh     %[r0],      %[dest2_u],  %[three]             \n\t"
      "psrlh     %[src0],    %[src1],     %[eight]             \n\t"
      "and       %[dest2_u], %[src1],     %[c0]                \n\t"
      "and       %[src1],    %[src1],     %[c1]                \n\t"
      "psrlh     %[src1],    %[src1],     %[five]              \n\t"
      "and       %[dest2_v], %[src0],     %[c2]                \n\t"
      "psllh     %[dest2_v], %[dest2_v],  %[three]             \n\t"
      "or        %[dest2_v], %[src1],     %[dest2_v]           \n\t"
      "psrlh     %[src0],    %[src0],     %[three]             \n\t"
      "paddh     %[b0],      %[b0],       %[dest2_u]           \n\t"
      "paddh     %[g0],      %[g0],       %[dest2_v]           \n\t"
      "paddh     %[r0],      %[r0],       %[src0]              \n\t"
      "punpcklhw %[src0],    %[b0],       %[r0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[r0]                \n\t"
      "punpcklwd %[dest2_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest2_v], %[src0],     %[src1]              \n\t"
      "paddh     %[src0],    %[dest2_u],  %[dest2_v]           \n\t"
      "psrlh     %[b0],      %[src0],     %[six]               \n\t"
      "psllh     %[r0],      %[src0],     %[one]               \n\t"
      "or        %[b0],      %[b0],       %[r0]                \n\t"
      "punpcklhw %[src0],    %[g0],       %[value]             \n\t"
      "punpckhhw %[src1],    %[g0],       %[value]             \n\t"
      "punpcklwd %[dest2_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest2_v], %[src0],     %[src1]              \n\t"
      "paddh     %[g0],      %[dest2_u],  %[dest2_v]           \n\t"
      "punpcklhw %[src0],    %[b0],       %[g0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[g0]                \n\t"

      "pmaddhw   %[dest2_v], %[src0],     %[mask_v]            \n\t"
      "pshufh    %[dest2_u], %[src0],     %[mask]              \n\t"
      "pmaddhw   %[dest2_u], %[dest2_u],  %[mask_u]            \n\t"
      "pmaddhw   %[g0],      %[src1],     %[mask_v]            \n\t"
      "pshufh    %[b0],      %[src1],     %[mask]              \n\t"
      "pmaddhw   %[b0],      %[b0],       %[mask_u]            \n\t"

      "punpcklwd %[src0],    %[dest2_u],  %[b0]                \n\t"
      "punpckhwd %[src1],    %[dest2_u],  %[b0]                \n\t"
      "psubw     %[dest2_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest2_u], %[dest2_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest2_v],  %[g0]                \n\t"
      "punpckhwd %[src1],    %[dest2_v],  %[g0]                \n\t"
      "psubw     %[dest2_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest2_v], %[dest2_v],  %[eight]             \n\t"

      // Chunk 3 (pixels 12-15).
      "gsldrc1   %[src0],    0x18(%[src_rgb565])               \n\t"
      "gsldlc1   %[src0],    0x1f(%[src_rgb565])               \n\t"
      "gsldrc1   %[src1],    0x18(%[next_rgb565])              \n\t"
      "gsldlc1   %[src1],    0x1f(%[next_rgb565])              \n\t"
      "psrlh     %[dest3_u], %[src0],     %[eight]             \n\t"
      "and       %[b0],      %[src0],     %[c0]                \n\t"
      "and       %[src0],    %[src0],     %[c1]                \n\t"
      "psrlh     %[src0],    %[src0],     %[five]              \n\t"
      "and       %[g0],      %[dest3_u],  %[c2]                \n\t"
      "psllh     %[g0],      %[g0],       %[three]             \n\t"
      "or        %[g0],      %[src0],     %[g0]                \n\t"
      "psrlh     %[r0],      %[dest3_u],  %[three]             \n\t"
      "psrlh     %[src0],    %[src1],     %[eight]             \n\t"
      "and       %[dest3_u], %[src1],     %[c0]                \n\t"
      "and       %[src1],    %[src1],     %[c1]                \n\t"
      "psrlh     %[src1],    %[src1],     %[five]              \n\t"
      "and       %[dest3_v], %[src0],     %[c2]                \n\t"
      "psllh     %[dest3_v], %[dest3_v],  %[three]             \n\t"
      "or        %[dest3_v], %[src1],     %[dest3_v]           \n\t"
      "psrlh     %[src0],    %[src0],     %[three]             \n\t"
      "paddh     %[b0],      %[b0],       %[dest3_u]           \n\t"
      "paddh     %[g0],      %[g0],       %[dest3_v]           \n\t"
      "paddh     %[r0],      %[r0],       %[src0]              \n\t"
      "punpcklhw %[src0],    %[b0],       %[r0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[r0]                \n\t"
      "punpcklwd %[dest3_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest3_v], %[src0],     %[src1]              \n\t"
      "paddh     %[src0],    %[dest3_u],  %[dest3_v]           \n\t"
      "psrlh     %[b0],      %[src0],     %[six]               \n\t"
      "psllh     %[r0],      %[src0],     %[one]               \n\t"
      "or        %[b0],      %[b0],       %[r0]                \n\t"
      "punpcklhw %[src0],    %[g0],       %[value]             \n\t"
      "punpckhhw %[src1],    %[g0],       %[value]             \n\t"
      "punpcklwd %[dest3_u], %[src0],     %[src1]              \n\t"
      "punpckhwd %[dest3_v], %[src0],     %[src1]              \n\t"
      "paddh     %[g0],      %[dest3_u],  %[dest3_v]           \n\t"
      "punpcklhw %[src0],    %[b0],       %[g0]                \n\t"
      "punpckhhw %[src1],    %[b0],       %[g0]                \n\t"

      "pmaddhw   %[dest3_v], %[src0],     %[mask_v]            \n\t"
      "pshufh    %[dest3_u], %[src0],     %[mask]              \n\t"
      "pmaddhw   %[dest3_u], %[dest3_u],  %[mask_u]            \n\t"
      "pmaddhw   %[g0],      %[src1],     %[mask_v]            \n\t"
      "pshufh    %[b0],      %[src1],     %[mask]              \n\t"
      "pmaddhw   %[b0],      %[b0],       %[mask_u]            \n\t"

      "punpcklwd %[src0],    %[dest3_u],  %[b0]                \n\t"
      "punpckhwd %[src1],    %[dest3_u],  %[b0]                \n\t"
      "psubw     %[dest3_u], %[src0],     %[src1]              \n\t"
      "psraw     %[dest3_u], %[dest3_u],  %[eight]             \n\t"
      "punpcklwd %[src0],    %[dest3_v],  %[g0]                \n\t"
      "punpckhwd %[src1],    %[dest3_v],  %[g0]                \n\t"
      "psubw     %[dest3_v], %[src1],     %[src0]              \n\t"
      "psraw     %[dest3_v], %[dest3_v],  %[eight]             \n\t"

      // Pack 8 U and 8 V bytes and store.
      "packsswh  %[src0],    %[dest0_u],  %[dest1_u]           \n\t"
      "packsswh  %[src1],    %[dest2_u],  %[dest3_u]           \n\t"
      "packushb  %[dest0_u], %[src0],     %[src1]              \n\t"
      "gssdlc1   %[dest0_u], 0x07(%[dst_u])                    \n\t"
      "gssdrc1   %[dest0_u], 0x00(%[dst_u])                    \n\t"
      "packsswh  %[src0],    %[dest0_v],  %[dest1_v]           \n\t"
      "packsswh  %[src1],    %[dest2_v],  %[dest3_v]           \n\t"
      "packushb  %[dest0_v], %[src0],     %[src1]              \n\t"
      "gssdlc1   %[dest0_v], 0x07(%[dst_v])                    \n\t"
      "gssdrc1   %[dest0_v], 0x00(%[dst_v])                    \n\t"

      "daddiu    %[src_rgb565],  %[src_rgb565],  0x20          \n\t"
      "daddiu    %[next_rgb565], %[next_rgb565], 0x20          \n\t"
      "daddiu    %[dst_u],   %[dst_u],    0x08                 \n\t"
      "daddiu    %[dst_v],   %[dst_v],    0x08                 \n\t"
      "daddiu    %[width],   %[width],    -0x10                \n\t"
      "bgtz      %[width],   1b                                \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
        [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
        [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
        [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
        [dest3_v] "=&f"(ftmp[12])
      : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
        [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
        [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
        [one] "f"(0x01)
      : "memory");
}
%[g0] \n\t" + "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" + "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddiu %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), + [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), + [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), + [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), + [dest3_v] "=&f"(ftmp[12]) + : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), + [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), + [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), + [one] "f"(0x01) + : "memory"); +} + +void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t ftmp[11]; + uint64_t value = 0x2020202020202020; + uint64_t mask_u = 0x0026004a00700002; + uint64_t mask_v = 0x00020070005e0012; + uint64_t mask = 0x93; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0003000300030003; + 
uint64_t c3 = 0x007c007c007c007c; + __asm__ volatile( + "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" + "psrlh %[dest0_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest0_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest0_u], %[c3] \n\t" + "psrlh %[r0], %[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest0_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest0_v], %[src0], %[c2] \n\t" + "psllh %[dest0_v], %[dest0_v], %[three] \n\t" + "or %[dest0_v], %[src1], %[dest0_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], %[dest0_u] \n\t" + "paddh %[g0], %[g0], %[dest0_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest0_u], %[src0], %[mask] \n\t" + 
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" + "psrlh %[dest1_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest1_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest1_u], %[c3] \n\t" + "psrlh %[r0], %[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest1_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest1_v], %[src0], %[c2] \n\t" + "psllh %[dest1_v], %[dest1_v], %[three] \n\t" + "or %[dest1_v], %[src1], %[dest1_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], %[dest1_u] \n\t" + "paddh %[g0], %[g0], %[dest1_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] 
\n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest1_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" + "psrlh %[dest2_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest2_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest2_u], %[c3] \n\t" + "psrlh %[r0], %[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest2_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest0_v], %[src0], %[c2] \n\t" + "psllh %[dest0_v], %[dest0_v], %[three] \n\t" + "or %[dest0_v], %[src1], %[dest0_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], 
%[dest2_u] \n\t" + "paddh %[g0], %[g0], %[dest0_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest2_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" + "psrlh %[dest3_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest3_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest3_u], %[c3] \n\t" + "psrlh %[r0], 
%[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest3_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest1_v], %[src0], %[c2] \n\t" + "psllh %[dest1_v], %[dest1_v], %[three] \n\t" + "or %[dest1_v], %[src1], %[dest1_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], %[dest3_u] \n\t" + "paddh %[g0], %[g0], %[dest1_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest3_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" + 
"gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" + "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" + "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddiu %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), + [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), + [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), + [dest1_v] "=&f"(ftmp[10]) + : [src_argb1555] "r"(src_argb1555), + [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), + [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), + [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), + [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), + [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), + [two] "f"(0x02), [one] "f"(0x01) + : "memory"); +} + +void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t ftmp[13]; + uint64_t value = 0x2020202020202020; + uint64_t mask_u = 0x0026004a00700002; + uint64_t mask_v = 0x00020070005e0012; + uint64_t mask = 0x93; + uint64_t c0 = 0x000f000f000f000f; + uint64_t c1 = 0x00ff00ff00ff00ff; + __asm__ volatile( + "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" + "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" + "psrlh %[dest0_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], 
%[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest0_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest0_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest0_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest0_u] \n\t" + "paddh %[g0], %[g0], %[dest0_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest0_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" + "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" + 
"psrlh %[dest1_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest1_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest1_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest1_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest1_u] \n\t" + "paddh %[g0], %[g0], %[dest1_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest1_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" + 
"gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" + "psrlh %[dest2_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest2_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest2_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest2_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest2_u] \n\t" + "paddh %[g0], %[g0], %[dest2_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest2_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + 
+ "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" + "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" + "psrlh %[dest3_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest3_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest3_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest3_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest3_u] \n\t" + "paddh %[g0], %[g0], %[dest3_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest3_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest3_v], %[g0] 
      // ---- tail of ARGB4444ToUVRow_MMI: finish the 4th 8-byte group ----
      // (dest3_v - reversed pairs) >> 8 completes the signed V dot-product.
      "psubw      %[dest3_v],      %[src1],           %[src0]       \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]      \n\t"

      // Pack the four 32-bit U (then V) group results down to 8 unsigned
      // bytes and store them with the unaligned store pair gssdlc1/gssdrc1.
      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]    \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]    \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                   \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                   \n\t"
      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]    \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]    \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                   \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                   \n\t"

      // Advance: 0x20 bytes = 16 ARGB4444 pixels consumed per iteration,
      // producing 8 U and 8 V bytes (2x2 subsampling); loop while width > 0.
      "daddiu     %[src_argb4444], %[src_argb4444],   0x20          \n\t"
      "daddiu     %[next_argb4444],%[next_argb4444],  0x20          \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08          \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08          \n\t"
      "daddiu     %[width],        %[width],         -0x10          \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
        [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
        [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
        [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
        [dest3_v] "=&f"(ftmp[12])
      : [src_argb4444] "r"(src_argb4444),
        [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
        [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
        [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
        [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
        [two] "f"(0x02)
      : "memory");
}

// Compute full-resolution (4:4:4) U and V planes from a row of ARGB.
// One U and one V byte is emitted per input pixel (no subsampling, no
// averaging with the next row).  Each loop iteration consumes 0x20 bytes
// (8 ARGB pixels) and `width` is decremented by 8, so width is assumed to
// be a positive multiple of 8 — NOTE(review): confirm callers guarantee
// this, as the final bgtz otherwise over-reads.
void ARGBToUV444Row_MMI(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  uint64_t ftmp[12];
  // Bias halfword inserted into the lane that pmaddhw multiplies by the
  // 0x0002 coefficient of mask_u / mask_v (supplies the U/V offset term).
  const uint64_t value = 0x4040;
  // Per-lane 16-bit dot-product coefficients for U and V (consumed by
  // pmaddhw, which multiplies halfword pairs and sums adjacent products).
  const uint64_t mask_u = 0x0026004a00700002;
  const uint64_t mask_v = 0x00020070005e0012;

  __asm__ volatile(
      "1:                                                          \n\t"
      // ---- group 0: pixels 0-1 (bytes 0x00-0x07) ----
      // Unaligned 8-byte load, then widen the two ARGB pixels to halfwords.
      "gsldrc1    %[src0],         0x00(%[src_argb])               \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb])               \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]      \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]      \n\t"
      // Build U operand (shift + bias in lane 0) and V operand (bias in
      // lane 3), then take the two signed dot products.
      "dsll       %[dest0_u],      %[src_lo],         %[sixteen]   \n\t"
      "pinsrh_0   %[dest0_u],      %[dest0_u],        %[value]     \n\t"
      "pinsrh_3   %[dest0_v],      %[src_lo],         %[value]     \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]    \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]    \n\t"

      "dsll       %[src_lo],       %[src_hi],         %[sixteen]   \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]     \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]    \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]    \n\t"

      // Interleave the two pixels' partial sums, subtract (the coefficient
      // signs are encoded by the operand order) and scale down by 2^8.
      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]    \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]    \n\t"
      "psubw      %[dest0_u],      %[src0],           %[src1]      \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]     \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]    \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]    \n\t"
      "psubw      %[dest0_v],      %[src1],           %[src0]      \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]     \n\t"

      // ---- group 1: pixels 2-3 (bytes 0x08-0x0f), same recipe ----
      "gsldrc1    %[src0],         0x08(%[src_argb])               \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_argb])               \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]      \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]      \n\t"
      "dsll       %[dest1_u],      %[src_lo],         %[sixteen]   \n\t"
      "pinsrh_0   %[dest1_u],      %[dest1_u],        %[value]     \n\t"
      "pinsrh_3   %[dest1_v],      %[src_lo],         %[value]     \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]    \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]    \n\t"
      "dsll       %[src_lo],       %[src_hi],         %[sixteen]   \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]     \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]    \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]    \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]    \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]    \n\t"
      "psubw      %[dest1_u],      %[src0],           %[src1]      \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]     \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]    \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]    \n\t"
      "psubw      %[dest1_v],      %[src1],           %[src0]      \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]     \n\t"

      // ---- group 2: pixels 4-5 (bytes 0x10-0x17) ----
      "gsldrc1    %[src0],         0x10(%[src_argb])               \n\t"
      "gsldlc1    %[src0],         0x17(%[src_argb])               \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]      \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]      \n\t"
      "dsll       %[dest2_u],      %[src_lo],         %[sixteen]   \n\t"
      "pinsrh_0   %[dest2_u],      %[dest2_u],        %[value]     \n\t"
      "pinsrh_3   %[dest2_v],      %[src_lo],         %[value]     \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]    \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]    \n\t"
      "dsll       %[src_lo],       %[src_hi],         %[sixteen]   \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]     \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]    \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]    \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]    \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]    \n\t"
      "psubw      %[dest2_u],      %[src0],           %[src1]      \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]     \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]    \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]    \n\t"
      "psubw      %[dest2_v],      %[src1],           %[src0]      \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]     \n\t"

      // ---- group 3: pixels 6-7 (bytes 0x18-0x1f) ----
      "gsldrc1    %[src0],         0x18(%[src_argb])               \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_argb])               \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]      \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]      \n\t"
      "dsll       %[dest3_u],      %[src_lo],         %[sixteen]   \n\t"
      "pinsrh_0   %[dest3_u],      %[dest3_u],        %[value]     \n\t"
      "pinsrh_3   %[dest3_v],      %[src_lo],         %[value]     \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]    \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]    \n\t"
      "dsll       %[src_lo],       %[src_hi],         %[sixteen]   \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]     \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]    \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]    \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]    \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]    \n\t"
      "psubw      %[dest3_u],      %[src0],           %[src1]      \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]     \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]    \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]    \n\t"
      "psubw      %[dest3_v],      %[src1],           %[src0]      \n\t"
      // ---- tail of ARGBToUV444Row_MMI: finish group 3 and store ----
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]     \n\t"

      // Pack the four 32-bit U group results to 8 unsigned bytes and store
      // with the unaligned store pair gssdlc1/gssdrc1; then the same for V.
      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]   \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]   \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]      \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                  \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                  \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]   \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]   \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]      \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                  \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                  \n\t"

      // Advance: 0x20 bytes = 8 ARGB pixels in, 8 U + 8 V bytes out.
      "daddiu     %[src_argb],     %[src_argb],       0x20         \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08         \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08         \n\t"
      "daddi      %[width],        %[width],         -0x08         \n\t"
      "bgtz       %[width],        1b                              \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
        [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
        [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
        [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
        [dest3_v] "=&f"(ftmp[11])
      : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
        [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
        [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
        [eight] "f"(0x08)
      : "memory");
}

// Convert a row of ARGB to grayscale ARGB, preserving the alpha channel.
// Each 8-byte load covers 2 little-endian ARGB pixels (byte order B,G,R,A).
// For each pixel the luma is computed as a fixed-point dot product:
//   y = (15*B + 75*G + 38*R + 64) >> 7
// (mask2 halfwords low-to-high are 0x000F, 0x004B, 0x0026, 0x0040; the
// alpha halfword is overwritten with 1 by pinsrh_3 so it contributes the
// +64 rounding term), and y is replicated into the B, G and R bytes.
// Two pixels are processed per iteration and the loop exits via bnez, so
// width must be a positive even number — NOTE(review): confirm callers
// guarantee this; an odd width would never hit zero.
void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
  uint64_t tmp0, tmp1;
  const uint64_t mask0 = 0x0;                      // zero, for byte widening
  const uint64_t mask1 = 0x01;                     // replaces A lane (x64 term)
  const uint64_t mask2 = 0x00400026004B000FULL;    // weights 64,38,75,15
  const uint64_t mask3 = 0xFF000000FF000000ULL;    // selects the two A bytes
  const uint64_t mask4 = ~mask3;                   // clears A in the gray word
  const uint64_t shift = 0x07;                     // >>7 fixed-point scale

  __asm__ volatile(
      "1:                                                          \n\t"
      // Unaligned 8-byte load of 2 ARGB pixels.
      "gsldlc1    %[src],          0x07(%[src_ptr])                \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                \n\t"

      // Save the original alpha bytes for later re-merge.
      "and        %[src37],        %[src],            %[mask3]     \n\t"

      // Pixel 0: widen to halfwords, force A lane to 1, dot with weights,
      // sum the two 32-bit partial products, >>7, replicate via packsswh.
      "punpcklbh  %[src_lo],       %[src],            %[mask0]     \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[mask1]     \n\t"
      "pmaddhw    %[dest_lo],      %[src_lo],         %[mask2]     \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],        %[dest_lo]   \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],        %[dest_lo]   \n\t"
      "paddw      %[dest_lo],      %[tmp0],           %[tmp1]      \n\t"
      "psrlw      %[dest_lo],      %[dest_lo],        %[shift]     \n\t"
      "packsswh   %[dest_lo],      %[dest_lo],        %[dest_lo]   \n\t"

      // Pixel 1: same computation on the high 4 bytes.
      "punpckhbh  %[src_hi],       %[src],            %[mask0]     \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[mask1]     \n\t"
      "pmaddhw    %[dest_hi],      %[src_hi],         %[mask2]     \n\t"
      "punpcklwd  %[tmp0],         %[dest_hi],        %[dest_hi]   \n\t"
      "punpckhwd  %[tmp1],         %[dest_hi],        %[dest_hi]   \n\t"
      "paddw      %[dest_hi],      %[tmp0],           %[tmp1]      \n\t"
      "psrlw      %[dest_hi],      %[dest_hi],        %[shift]     \n\t"
      "packsswh   %[dest_hi],      %[dest_hi],        %[dest_hi]   \n\t"

      // Pack gray to bytes, drop its alpha lanes, restore original alpha.
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]   \n\t"
      "and        %[dest],         %[dest],           %[mask4]     \n\t"
      "or         %[dest],         %[dest],           %[src37]     \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      // 2 pixels per iteration; bnez requires width to reach exactly 0.
      "daddiu     %[src_ptr],      %[src_ptr],        0x08         \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08         \n\t"
      "daddi      %[width],        %[width],         -0x02         \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
        [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
        [src37] "=&f"(src37)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
        [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
      : "memory");
}

// Convert a row of image to Sepia tone.
// In-place sepia filter, 2 ARGB pixels per iteration.
// New channels (from the constants below):
//   B = (17*B + 68*G + 35*R) >> 7
//   G = (22*B + 88*G + 45*R) >> 7
//   R = (24*B + 98*G + 50*R) >> 7
// Alpha is preserved by OR-ing the saved alpha bytes back in (the weighted
// alpha lane contributes only to a result lane that is later discarded).
// NOTE(review): assumes width is a positive multiple of 2; [dst_ptr]/[width]
// are "r" inputs modified inside the asm — same caveat as elsewhere in this
// file.
void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
  uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
  uint64_t tmp0, tmp1;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x002300440011ULL;  // Blue weights:  R*35 G*68 B*17.
  const uint64_t mask2 = 0x002D00580016ULL;  // Green weights: R*45 G*88 B*22.
  const uint64_t mask3 = 0x003200620018ULL;  // Red weights:   R*50 G*98 B*24.
  const uint64_t mask4 = 0xFF000000FF000000ULL;  // Alpha bytes.
  const uint64_t shift = 0x07;                   // /128.

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gsldrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      // Save the two alpha bytes.
      "and        %[dest37],       %[dest],         %[mask4]       \n\t"

      // Pixel 0: three weighted dot products, horizontal adds, >>7.
      "punpcklbh  %[dest_lo],      %[dest],         %[mask0]       \n\t"
      "pmaddhw    %[dest0],        %[dest_lo],      %[mask1]       \n\t"
      "pmaddhw    %[dest1],        %[dest_lo],      %[mask2]       \n\t"
      "pmaddhw    %[dest2],        %[dest_lo],      %[mask3]       \n\t"
      "punpcklwd  %[tmp0],         %[dest0],        %[dest1]       \n\t"
      "punpckhwd  %[tmp1],         %[dest0],        %[dest1]       \n\t"
      "paddw      %[dest0],        %[tmp0],         %[tmp1]        \n\t"
      "psrlw      %[dest0],        %[dest0],        %[shift]       \n\t"
      "punpcklwd  %[tmp0],         %[dest2],        %[mask0]       \n\t"
      "punpckhwd  %[tmp1],         %[dest2],        %[mask0]       \n\t"
      "paddw      %[dest1],        %[tmp0],         %[tmp1]        \n\t"
      "psrlw      %[dest1],        %[dest1],        %[shift]       \n\t"
      "packsswh   %[dest_lo],      %[dest0],        %[dest1]       \n\t"

      // Pixel 1: identical computation on the high 4 bytes.
      "punpckhbh  %[dest_hi],      %[dest],         %[mask0]       \n\t"
      "pmaddhw    %[dest0],        %[dest_hi],      %[mask1]       \n\t"
      "pmaddhw    %[dest1],        %[dest_hi],      %[mask2]       \n\t"
      "pmaddhw    %[dest2],        %[dest_hi],      %[mask3]       \n\t"
      "punpcklwd  %[tmp0],         %[dest0],        %[dest1]       \n\t"
      "punpckhwd  %[tmp1],         %[dest0],        %[dest1]       \n\t"
      "paddw      %[dest0],        %[tmp0],         %[tmp1]        \n\t"
      "psrlw      %[dest0],        %[dest0],        %[shift]       \n\t"
      "punpcklwd  %[tmp0],         %[dest2],        %[mask0]       \n\t"
      "punpckhwd  %[tmp1],         %[dest2],        %[mask0]       \n\t"
      "paddw      %[dest1],        %[tmp0],         %[tmp1]        \n\t"
      "psrlw      %[dest1],        %[dest1],        %[shift]       \n\t"
      "packsswh   %[dest_hi],      %[dest0],        %[dest1]       \n\t"

      // Repack to bytes and restore alpha.
      "packushb   %[dest],         %[dest_lo],      %[dest_hi]     \n\t"
      "or         %[dest],         %[dest],         %[dest37]      \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      "daddiu     %[dst_ptr],      %[dst_ptr],      0x08           \n\t"
      "daddi      %[width],        %[width],       -0x02           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
        [dest] "=&f"(dest)
      : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
        [mask4] "f"(mask4), [shift] "f"(shift)
      : "memory");
}

// Apply color matrix to a row of image. Matrix is signed.
// TODO(fbarchard): Consider adding rounding (+32).
// The 16-byte matrix is reloaded and sign-extended (<<8 then arithmetic >>8)
// every iteration; each output channel is a signed dot product of one matrix
// row with the pixel's 4 channels, scaled by >>6. Processes 2 pixels per loop.
// NOTE(review): [mask1] is passed in but never referenced by the asm body.
void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
                            int width) {
  uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
      dest3;
  uint64_t matrix, matrix_hi, matrix_lo;
  uint64_t tmp0, tmp1;
  const uint64_t shift0 = 0x06;  // Final scale: >>6 (matrix is in 6.2-ish fixed point).
  const uint64_t shift1 = 0x08;  // Used to sign-extend int8 matrix entries to int16.
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x08;

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                \n\t"

      "punpcklbh  %[src_lo],       %[src],          %[mask0]       \n\t"

      // Pixel 0, matrix rows 0-1 (bytes 0-7): two dot products -> dest0.
      "gsldlc1    %[matrix],       0x07(%[matrix_ptr])             \n\t"
      "gsldrc1    %[matrix],       0x00(%[matrix_ptr])             \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "pmaddhw    %[dest_lo],      %[src_lo],       %[matrix_lo]   \n\t"
      "pmaddhw    %[dest_hi],      %[src_lo],       %[matrix_hi]   \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],      %[dest_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],      %[dest_hi]     \n\t"
      "paddw      %[dest0],        %[tmp0],         %[tmp1]        \n\t"
      "psraw      %[dest0],        %[dest0],        %[shift0]      \n\t"

      // Pixel 0, matrix rows 2-3 (bytes 8-15) -> dest1.
      "gsldlc1    %[matrix],       0x0f(%[matrix_ptr])             \n\t"
      "gsldrc1    %[matrix],       0x08(%[matrix_ptr])             \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "pmaddhw    %[dest_lo],      %[src_lo],       %[matrix_lo]   \n\t"
      "pmaddhw    %[dest_hi],      %[src_lo],       %[matrix_hi]   \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],      %[dest_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],      %[dest_hi]     \n\t"
      "paddw      %[dest1],        %[tmp0],         %[tmp1]        \n\t"
      "psraw      %[dest1],        %[dest1],        %[shift0]      \n\t"

      "punpckhbh  %[src_hi],       %[src],          %[mask0]       \n\t"

      // Pixel 1, matrix rows 0-1 -> dest2.
      "gsldlc1    %[matrix],       0x07(%[matrix_ptr])             \n\t"
      "gsldrc1    %[matrix],       0x00(%[matrix_ptr])             \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "pmaddhw    %[dest_lo],      %[src_hi],       %[matrix_lo]   \n\t"
      "pmaddhw    %[dest_hi],      %[src_hi],       %[matrix_hi]   \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],      %[dest_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],      %[dest_hi]     \n\t"
      "paddw      %[dest2],        %[tmp0],         %[tmp1]        \n\t"
      "psraw      %[dest2],        %[dest2],        %[shift0]      \n\t"

      // Pixel 1, matrix rows 2-3 -> dest3.
      "gsldlc1    %[matrix],       0x0f(%[matrix_ptr])             \n\t"
      "gsldrc1    %[matrix],       0x08(%[matrix_ptr])             \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],    %[shift1]      \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],       %[mask0]       \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],    %[shift1]      \n\t"
      "pmaddhw    %[dest_lo],      %[src_hi],       %[matrix_lo]   \n\t"
      "pmaddhw    %[dest_hi],      %[src_hi],       %[matrix_hi]   \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],      %[dest_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],      %[dest_hi]     \n\t"
      "paddw      %[dest3],        %[tmp0],         %[tmp1]        \n\t"
      "psraw      %[dest3],        %[dest3],        %[shift0]      \n\t"

      // Pack the 8 channel results to unsigned bytes and store 2 pixels.
      "packsswh   %[tmp0],         %[dest0],        %[dest1]       \n\t"
      "packsswh   %[tmp1],         %[dest2],        %[dest3]       \n\t"
      "packushb   %[dest],         %[tmp0],         %[tmp1]        \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],      0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x08           \n\t"
      "daddi      %[width],        %[width],       -0x02           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
        [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
        [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
      : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
        [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
        [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
      : "memory");
}

// Scale each ARGB channel by the matching byte of 'value':
// out = (in * 0x0101 * value_channel) >> 16, i.e. roughly in*value/255.
// 2 pixels per iteration.
// NOTE(review): the asm writes %[value] (punpcklbh) although it is declared
// as an input-only "f" operand — a constraint violation; should be an
// in/out operand or expanded outside the loop. Verify against upstream.
void ARGBShadeRow_MMI(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      int width,
                      uint32_t value) {
  uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                \n\t"
      // Duplicate each source byte into a halfword: x -> x*0x0101.
      "punpcklbh  %[src_lo],       %[src],          %[src]         \n\t"
      "punpckhbh  %[src_hi],       %[src],          %[src]         \n\t"

      // Expand the 4 scale bytes to halfwords (done every iteration).
      "punpcklbh  %[value],        %[value],        %[value]       \n\t"

      "pmulhuh    %[dest_lo],      %[src_lo],       %[value]       \n\t"
      "psrlh      %[dest_lo],      %[dest_lo],      %[shift]       \n\t"
      "pmulhuh    %[dest_hi],      %[src_hi],       %[value]       \n\t"
      "psrlh      %[dest_hi],      %[dest_hi],      %[shift]       \n\t"
      "packushb   %[dest],         %[dest_lo],      %[dest_hi]     \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],      0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x08           \n\t"
      "daddi      %[width],        %[width],       -0x02           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
        [dest] "=&f"(dest)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
        [value] "f"(value), [shift] "f"(shift)
      : "memory");
}

// Multiply two rows of ARGB per channel:
// out = high16(src0*0x0101 * src1) ~= src0*src1/255.  2 pixels per iteration.
void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
  uint64_t dest, dest_lo, dest_hi;
  const uint64_t mask = 0x0;

  __asm__ volatile(
      "1:                                                          \n\t"
      // src0 bytes duplicated into halfwords (x*0x0101).
      "gsldlc1    %[src0],         0x07(%[src0_ptr])               \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])               \n\t"
      "punpcklbh  %[src0_lo],      %[src0],         %[src0]        \n\t"
      "punpckhbh  %[src0_hi],      %[src0],         %[src0]        \n\t"

      // src1 bytes zero-extended to halfwords.
      "gsldlc1    %[src1],         0x07(%[src1_ptr])               \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])               \n\t"
      "punpcklbh  %[src1_lo],      %[src1],         %[mask]        \n\t"
      "punpckhbh  %[src1_hi],      %[src1],         %[mask]        \n\t"

      "pmulhuh    %[dest_lo],      %[src0_lo],      %[src1_lo]     \n\t"
      "pmulhuh    %[dest_hi],      %[src0_hi],      %[src1_hi]     \n\t"
      "packushb   %[dest],         %[dest_lo],      %[dest_hi]     \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],     0x08           \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],     0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x08           \n\t"
      "daddi      %[width],        %[width],       -0x02           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
        [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
        [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
      : "memory");
}

// Saturating per-byte add of two ARGB rows. 2 pixels per iteration.
void ARGBAddRow_MMI(const uint8_t* src_argb0,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  uint64_t src0, src1, dest;

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[src0],         0x07(%[src0_ptr])               \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])               \n\t"
      "gsldlc1    %[src1],         0x07(%[src1_ptr])               \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])               \n\t"
      "paddusb    %[dest],         %[src0],         %[src1]        \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],     0x08           \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],     0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x08           \n\t"
      "daddi      %[width],        %[width],       -0x02           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width)
      : "memory");
}

// Saturating per-byte subtract (src0 - src1) of two ARGB rows.
// 2 pixels per iteration.
void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  uint64_t src0, src1, dest;

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[src0],         0x07(%[src0_ptr])               \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])               \n\t"
      "gsldlc1    %[src1],         0x07(%[src1_ptr])               \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])               \n\t"
      "psubusb    %[dest],         %[src0],         %[src1]        \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],     0x08           \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],     0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x08           \n\t"
      "daddi      %[width],        %[width],       -0x02           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width)
      : "memory");
}

// Sobel functions which mimic SSSE3.
// Horizontal Sobel: for each column i,
//   sobelx = |(y0[i]+2*y1[i]+y2[i]) - (y0[i+2]+2*y1[i+2]+y2[i+2])|
// clamped to 255.  Processes 8 output bytes per iteration (two 4-wide
// halves); abs() is computed as max(a,b)-min(a,b) on 16-bit lanes.
void SobelXRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   const uint8_t* src_y2,
                   uint8_t* dst_sobelx,
                   int width) {
  uint64_t y00 = 0, y10 = 0, y20 = 0;
  uint64_t y02 = 0, y12 = 0, y22 = 0;
  uint64_t zero = 0x0;
  uint64_t sobel = 0x0;
  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[y00],          0x07(%[src_y0])                 \n\t"  // a=src_y0[i]
      "gsldrc1    %[y00],          0x00(%[src_y0])                 \n\t"
      "gsldlc1    %[y02],          0x09(%[src_y0])                 \n\t"  // a_sub=src_y0[i+2]
      "gsldrc1    %[y02],          0x02(%[src_y0])                 \n\t"

      "gsldlc1    %[y10],          0x07(%[src_y1])                 \n\t"  // b=src_y1[i]
      "gsldrc1    %[y10],          0x00(%[src_y1])                 \n\t"
      "gsldlc1    %[y12],          0x09(%[src_y1])                 \n\t"  // b_sub=src_y1[i+2]
      "gsldrc1    %[y12],          0x02(%[src_y1])                 \n\t"

      "gsldlc1    %[y20],          0x07(%[src_y2])                 \n\t"  // c=src_y2[i]
      "gsldrc1    %[y20],          0x00(%[src_y2])                 \n\t"
      "gsldlc1    %[y22],          0x09(%[src_y2])                 \n\t"  // c_sub=src_y2[i+2]
      "gsldrc1    %[y22],          0x02(%[src_y2])                 \n\t"

      // Widen the low 4 bytes of each load to 16-bit lanes.
      "punpcklbh  %[y00],          %[y00],          %[zero]        \n\t"
      "punpcklbh  %[y10],          %[y10],          %[zero]        \n\t"
      "punpcklbh  %[y20],          %[y20],          %[zero]        \n\t"

      "punpcklbh  %[y02],          %[y02],          %[zero]        \n\t"
      "punpcklbh  %[y12],          %[y12],          %[zero]        \n\t"
      "punpcklbh  %[y22],          %[y22],          %[zero]        \n\t"

      "paddh      %[y00],          %[y00],          %[y10]         \n\t"  // a+b
      "paddh      %[y20],          %[y20],          %[y10]         \n\t"  // c+b
      "paddh      %[y00],          %[y00],          %[y20]         \n\t"  // a+2b+c

      "paddh      %[y02],          %[y02],          %[y12]         \n\t"  // a_sub+b_sub
      "paddh      %[y22],          %[y22],          %[y12]         \n\t"  // c_sub+b_sub
      "paddh      %[y02],          %[y02],          %[y22]         \n\t"  // a_sub+2b_sub+c_sub

      "pmaxsh     %[y10],          %[y00],          %[y02]         \n\t"
      "pminsh     %[y20],          %[y00],          %[y02]         \n\t"
      "psubh      %[sobel],        %[y10],          %[y20]         \n\t"  // Abs

      // Second half: columns i+4..i+7.
      "gsldlc1    %[y00],          0x0B(%[src_y0])                 \n\t"
      "gsldrc1    %[y00],          0x04(%[src_y0])                 \n\t"
      "gsldlc1    %[y02],          0x0D(%[src_y0])                 \n\t"
      "gsldrc1    %[y02],          0x06(%[src_y0])                 \n\t"

      "gsldlc1    %[y10],          0x0B(%[src_y1])                 \n\t"
      "gsldrc1    %[y10],          0x04(%[src_y1])                 \n\t"
      "gsldlc1    %[y12],          0x0D(%[src_y1])                 \n\t"
      "gsldrc1    %[y12],          0x06(%[src_y1])                 \n\t"

      "gsldlc1    %[y20],          0x0B(%[src_y2])                 \n\t"
      "gsldrc1    %[y20],          0x04(%[src_y2])                 \n\t"
      "gsldlc1    %[y22],          0x0D(%[src_y2])                 \n\t"
      "gsldrc1    %[y22],          0x06(%[src_y2])                 \n\t"

      "punpcklbh  %[y00],          %[y00],          %[zero]        \n\t"
      "punpcklbh  %[y10],          %[y10],          %[zero]        \n\t"
      "punpcklbh  %[y20],          %[y20],          %[zero]        \n\t"

      "punpcklbh  %[y02],          %[y02],          %[zero]        \n\t"
      "punpcklbh  %[y12],          %[y12],          %[zero]        \n\t"
      "punpcklbh  %[y22],          %[y22],          %[zero]        \n\t"

      "paddh      %[y00],          %[y00],          %[y10]         \n\t"
      "paddh      %[y20],          %[y20],          %[y10]         \n\t"
      "paddh      %[y00],          %[y00],          %[y20]         \n\t"

      "paddh      %[y02],          %[y02],          %[y12]         \n\t"
      "paddh      %[y22],          %[y22],          %[y12]         \n\t"
      "paddh      %[y02],          %[y02],          %[y22]         \n\t"

      "pmaxsh     %[y10],          %[y00],          %[y02]         \n\t"
      "pminsh     %[y20],          %[y00],          %[y02]         \n\t"
      "psubh      %[y00],          %[y10],          %[y20]         \n\t"

      "packushb   %[sobel],        %[sobel],        %[y00]         \n\t"  // clamp255
      "sdc1       %[sobel],        0(%[dst_sobelx])                \n\t"

      "daddiu     %[src_y0],       %[src_y0],       8              \n\t"
      "daddiu     %[src_y1],       %[src_y1],       8              \n\t"
      "daddiu     %[src_y2],       %[src_y2],       8              \n\t"
      "daddiu     %[dst_sobelx],   %[dst_sobelx],   8              \n\t"
      "daddiu     %[width],        %[width],       -8              \n\t"
      "bgtz       %[width],        1b                              \n\t"
      "nop                                                         \n\t"
      : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
        [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
      : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
        [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
      : "memory");
}

// Vertical Sobel: for each column i,
//   sobely = |(y0[i]+2*y0[i+1]+y0[i+2]) - (y1[i]+2*y1[i+1]+y1[i+2])|
// clamped to 255.  Same two-half, 8-bytes-per-iteration structure as
// SobelXRow_MMI.
void SobelYRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   uint8_t* dst_sobely,
                   int width) {
  uint64_t y00 = 0, y01 = 0, y02 = 0;
  uint64_t y10 = 0, y11 = 0, y12 = 0;
  uint64_t zero = 0x0;
  uint64_t sobel = 0x0;
  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[y00],          0x07(%[src_y0])                 \n\t"  // a=src_y0[i]
      "gsldrc1    %[y00],          0x00(%[src_y0])                 \n\t"
      "gsldlc1    %[y01],          0x08(%[src_y0])                 \n\t"  // b=src_y0[i+1]
      "gsldrc1    %[y01],          0x01(%[src_y0])                 \n\t"
      "gsldlc1    %[y02],          0x09(%[src_y0])                 \n\t"  // c=src_y0[i+2]
      "gsldrc1    %[y02],          0x02(%[src_y0])                 \n\t"

      "gsldlc1    %[y10],          0x07(%[src_y1])                 \n\t"  // a_sub=src_y1[i]
      "gsldrc1    %[y10],          0x00(%[src_y1])                 \n\t"
      "gsldlc1    %[y11],          0x08(%[src_y1])                 \n\t"  // b_sub=src_y1[i+1]
      "gsldrc1    %[y11],          0x01(%[src_y1])                 \n\t"
      "gsldlc1    %[y12],          0x09(%[src_y1])                 \n\t"  // c_sub=src_y1[i+2]
      "gsldrc1    %[y12],          0x02(%[src_y1])                 \n\t"

      "punpcklbh  %[y00],          %[y00],          %[zero]        \n\t"
      "punpcklbh  %[y01],          %[y01],          %[zero]        \n\t"
      "punpcklbh  %[y02],          %[y02],          %[zero]        \n\t"

      "punpcklbh  %[y10],          %[y10],          %[zero]        \n\t"
      "punpcklbh  %[y11],          %[y11],          %[zero]        \n\t"
      "punpcklbh  %[y12],          %[y12],          %[zero]        \n\t"

      "paddh      %[y00],          %[y00],          %[y01]         \n\t"  // a+b
      "paddh      %[y02],          %[y02],          %[y01]         \n\t"  // c+b
      "paddh      %[y00],          %[y00],          %[y02]         \n\t"  // a+2b+c

      "paddh      %[y10],          %[y10],          %[y11]         \n\t"  // a_sub+b_sub
      "paddh      %[y12],          %[y12],          %[y11]         \n\t"  // c_sub+b_sub
      "paddh      %[y10],          %[y10],          %[y12]         \n\t"  // a_sub+2b_sub+c_sub

      "pmaxsh     %[y02],          %[y00],          %[y10]         \n\t"
      "pminsh     %[y12],          %[y00],          %[y10]         \n\t"
      "psubh      %[sobel],        %[y02],          %[y12]         \n\t"  // Abs

      // Second half: columns i+4..i+7.
      "gsldlc1    %[y00],          0x0B(%[src_y0])                 \n\t"
      "gsldrc1    %[y00],          0x04(%[src_y0])                 \n\t"
      "gsldlc1    %[y01],          0x0C(%[src_y0])                 \n\t"
      "gsldrc1    %[y01],          0x05(%[src_y0])                 \n\t"
      "gsldlc1    %[y02],          0x0D(%[src_y0])                 \n\t"
      "gsldrc1    %[y02],          0x06(%[src_y0])                 \n\t"

      "gsldlc1    %[y10],          0x0B(%[src_y1])                 \n\t"
      "gsldrc1    %[y10],          0x04(%[src_y1])                 \n\t"
      "gsldlc1    %[y11],          0x0C(%[src_y1])                 \n\t"
      "gsldrc1    %[y11],          0x05(%[src_y1])                 \n\t"
      "gsldlc1    %[y12],          0x0D(%[src_y1])                 \n\t"
      "gsldrc1    %[y12],          0x06(%[src_y1])                 \n\t"

      "punpcklbh  %[y00],          %[y00],          %[zero]        \n\t"
      "punpcklbh  %[y01],          %[y01],          %[zero]        \n\t"
      "punpcklbh  %[y02],          %[y02],          %[zero]        \n\t"

      "punpcklbh  %[y10],          %[y10],          %[zero]        \n\t"
      "punpcklbh  %[y11],          %[y11],          %[zero]        \n\t"
      "punpcklbh  %[y12],          %[y12],          %[zero]        \n\t"

      "paddh      %[y00],          %[y00],          %[y01]         \n\t"
      "paddh      %[y02],          %[y02],          %[y01]         \n\t"
      "paddh      %[y00],          %[y00],          %[y02]         \n\t"

      "paddh      %[y10],          %[y10],          %[y11]         \n\t"
      "paddh      %[y12],          %[y12],          %[y11]         \n\t"
      "paddh      %[y10],          %[y10],          %[y12]         \n\t"

      "pmaxsh     %[y02],          %[y00],          %[y10]         \n\t"
      "pminsh     %[y12],          %[y00],          %[y10]         \n\t"
      "psubh      %[y00],          %[y02],          %[y12]         \n\t"

      "packushb   %[sobel],        %[sobel],        %[y00]         \n\t"  // clamp255
      "sdc1       %[sobel],        0(%[dst_sobely])                \n\t"

      "daddiu     %[src_y0],       %[src_y0],       8              \n\t"
      "daddiu     %[src_y1],       %[src_y1],       8              \n\t"
      "daddiu     %[dst_sobely],   %[dst_sobely],   8              \n\t"
      "daddiu     %[width],        %[width],       -8              \n\t"
      "bgtz       %[width],        1b                              \n\t"
      "nop                                                         \n\t"
      : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), [y02] "=&f"(y02),
        [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
      : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [dst_sobely] "r"(dst_sobely),
        [width] "r"(width), [zero] "f"(zero)
      : "memory");
}

// Combine sobelx + sobely (saturating) and expand each result byte s into
// an ARGB pixel 0xFFssssss.  Consumes 8 input bytes, emits 8 pixels
// (32 output bytes) per iteration.
void SobelRow_MMI(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  double temp[3];
  uint64_t c1 = 0xff000000ff000000;  // Alpha = 0xFF for each output pixel.
  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[t0],           0x07(%[src_sobelx])             \n\t"  // a=src_sobelx[i]
      "gsldrc1    %[t0],           0x00(%[src_sobelx])             \n\t"
      "gsldlc1    %[t1],           0x07(%[src_sobely])             \n\t"  // b=src_sobely[i]
      "gsldrc1    %[t1],           0x00(%[src_sobely])             \n\t"
      // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
      "paddusb    %[t2] ,          %[t0],           %[t1]          \n\t"

      // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
      "punpcklbh  %[t0],           %[t2],           %[t2]          \n\t"

      // s1 s1 s0 s0->s1 s1 s1 s1 s0 s0 s0 s0
      "punpcklbh  %[t1],           %[t0],           %[t0]          \n\t"
      "or         %[t1],           %[t1],           %[c1]          \n\t"
      // 255 s1 s1 s1 255 s0 s0 s0
      "sdc1       %[t1],           0x00(%[dst_argb])               \n\t"

      // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
      "punpckhbh  %[t1],           %[t0],           %[t0]          \n\t"
      "or         %[t1],           %[t1],           %[c1]          \n\t"
      // 255 s3 s3 s3 255 s2 s2 s2
      "sdc1       %[t1],           0x08(%[dst_argb])               \n\t"

      // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
      "punpckhbh  %[t0],           %[t2],           %[t2]          \n\t"

      // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
      "punpcklbh  %[t1],           %[t0],           %[t0]          \n\t"
      "or         %[t1],           %[t1],           %[c1]          \n\t"
      "sdc1       %[t1],           0x10(%[dst_argb])               \n\t"

      // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
      "punpckhbh  %[t1],           %[t0],           %[t0]          \n\t"
      "or         %[t1],           %[t1],           %[c1]          \n\t"
      "sdc1       %[t1],           0x18(%[dst_argb])               \n\t"

      "daddiu     %[dst_argb],     %[dst_argb],     32             \n\t"
      "daddiu     %[src_sobelx],   %[src_sobelx],   8              \n\t"
      "daddiu     %[src_sobely],   %[src_sobely],   8              \n\t"
      "daddiu     %[width],        %[width],       -8              \n\t"
      "bgtz       %[width],        1b                              \n\t"
      "nop                                                         \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
      : "memory");
}

// Combine sobelx + sobely (saturating) into a single grayscale plane.
// 8 bytes per iteration.  Uses aligned ldc1/sdc1 — assumes 8-byte aligned
// pointers (NOTE(review): confirm callers guarantee alignment).
void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
                         const uint8_t* src_sobely,
                         uint8_t* dst_y,
                         int width) {
  uint64_t tr = 0;
  uint64_t tb = 0;
  __asm__ volatile(
      "1:                                                          \n\t"
      "ldc1       %[tr],           0x0(%[src_sobelx])              \n\t"  // r=src_sobelx[i]
      "ldc1       %[tb],           0x0(%[src_sobely])              \n\t"  // b=src_sobely[i]
      "paddusb    %[tr],           %[tr],           %[tb]          \n\t"  // g
      "sdc1       %[tr],           0x0(%[dst_y])                   \n\t"

      "daddiu     %[dst_y],        %[dst_y],        8              \n\t"
      "daddiu     %[src_sobelx],   %[src_sobelx],   8              \n\t"
      "daddiu     %[src_sobely],   %[src_sobely],   8              \n\t"
      "daddiu     %[width],        %[width],       -8              \n\t"
      "bgtz       %[width],        1b                              \n\t"
      "nop                                                         \n\t"
      : [tr] "=&f"(tr), [tb] "=&f"(tb)
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_y] "r"(dst_y), [width] "r"(width)
      : "memory");
}

// Mix sobelx into R, sobely into B, their saturating sum into G, with
// alpha 0xFF, producing ARGB pixels (B=sobely, G=x+y, R=sobelx, A=255).
// 8 input bytes -> 8 pixels (32 bytes) per iteration.
void SobelXYRow_MMI(const uint8_t* src_sobelx,
                    const uint8_t* src_sobely,
                    uint8_t* dst_argb,
                    int width) {
  uint64_t temp[3];
  uint64_t result = 0;
  uint64_t gb = 0;
  uint64_t cr = 0;
  uint64_t c1 = 0xffffffffffffffff;  // Source of the 0xFF alpha bytes.
  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[tr],           0x07(%[src_sobelx])             \n\t"  // r=src_sobelx[i]
      "gsldrc1    %[tr],           0x00(%[src_sobelx])             \n\t"
      "gsldlc1    %[tb],           0x07(%[src_sobely])             \n\t"  // b=src_sobely[i]
      "gsldrc1    %[tb],           0x00(%[src_sobely])             \n\t"
      "paddusb    %[tg] ,          %[tr],           %[tb]          \n\t"  // g

      // g3 b3 g2 b2 g1 b1 g0 b0
      "punpcklbh  %[gb],           %[tb],           %[tg]          \n\t"
      // c3 r3 c2 r2 c1 r1 c0 r0
      "punpcklbh  %[cr],           %[tr],           %[c1]          \n\t"
      // c1 r1 g1 b1 c0 r0 g0 b0
      "punpcklhw  %[result],       %[gb],           %[cr]          \n\t"
      "sdc1       %[result],       0x00(%[dst_argb])               \n\t"
      // c3 r3 g3 b3 c2 r2 g2 b2
      "punpckhhw  %[result],       %[gb],           %[cr]          \n\t"
      "sdc1       %[result],       0x08(%[dst_argb])               \n\t"

      // g7 b7 g6 b6 g5 b5 g4 b4
      "punpckhbh  %[gb],           %[tb],           %[tg]          \n\t"
      // c7 r7 c6 r6 c5 r5 c4 r4
      "punpckhbh  %[cr],           %[tr],           %[c1]          \n\t"
      // c5 r5 g5 b5 c4 r4 g4 b4
      "punpcklhw  %[result],       %[gb],           %[cr]          \n\t"
      "sdc1       %[result],       0x10(%[dst_argb])               \n\t"
      // c7 r7 g7 b7 c6 r6 g6 b6
      "punpckhhw  %[result],       %[gb],           %[cr]          \n\t"
      "sdc1       %[result],       0x18(%[dst_argb])               \n\t"

      "daddiu     %[dst_argb],     %[dst_argb],     32             \n\t"
      "daddiu     %[src_sobelx],   %[src_sobelx],   8              \n\t"
      "daddiu     %[src_sobely],   %[src_sobely],   8              \n\t"
      "daddiu     %[width],        %[width],       -8              \n\t"
      "bgtz       %[width],        1b                              \n\t"
      "nop                                                         \n\t"
      : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
        [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
      : "memory");
}

void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  // Copy a Y to RGB.
  // Each Y byte y becomes the pixel 0xFFyyyyyy; 4 pixels per iteration.
  uint64_t src, dest;
  const uint64_t mask0 = 0x00ffffff00ffffffULL;  // Keep B/G/R bytes.
  const uint64_t mask1 = ~mask0;                 // Force alpha to 0xFF.

  __asm__ volatile(
      "1:                                                          \n\t"
      "gslwlc1    %[src],          0x03(%[src_ptr])                \n\t"
      "gslwrc1    %[src],          0x00(%[src_ptr])                \n\t"
      // Fan each Y byte out to 4 bytes, then set the alpha byte.
      "punpcklbh  %[src],          %[src],          %[src]         \n\t"
      "punpcklhw  %[dest],         %[src],          %[src]         \n\t"
      "and        %[dest],         %[dest],         %[mask0]       \n\t"
      "or         %[dest],         %[dest],         %[mask1]       \n\t"
      "sdc1       %[dest],         0x00(%[dst_ptr])                \n\t"

      "punpckhhw  %[dest],         %[src],          %[src]         \n\t"
      "and        %[dest],         %[dest],         %[mask0]       \n\t"
      "or         %[dest],         %[dest],         %[mask1]       \n\t"
      "sdc1       %[dest],         0x08(%[dst_ptr])                \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],      0x04           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x10           \n\t"
      "daddi      %[width],        %[width],       -0x04           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest)
      : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

// Expand studio-range Y (I400) to full-range gray ARGB with alpha 0xFF.
// From the constants: out = ((y * 0x0101 * 0x4A35) >> 16 - 0x0488) >> 6,
// i.e. the usual ~1.164*(y-16) luma expansion — presumably BT.601; TODO
// confirm against the C reference.  8 Y bytes -> 8 pixels per iteration,
// one 2-pixel sub-block per pshufh selector (mask0/1/2/3 broadcast Y pairs).
void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
  uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x55;
  const uint64_t mask2 = 0xAA;
  const uint64_t mask3 = 0xFF;
  const uint64_t mask4 = 0x4A354A354A354A35ULL;
  const uint64_t mask5 = 0x0488048804880488ULL;
  const uint64_t shift0 = 0x08;
  const uint64_t shift1 = 0x06;

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                \n\t"
      "punpcklbh  %[src_lo],       %[src],          %[mask0]       \n\t"
      "punpckhbh  %[src_hi],       %[src],          %[mask0]       \n\t"

      // Pixels 0-1.
      "pshufh     %[src],          %[src_lo],       %[mask0]       \n\t"
      "psllh      %[dest_lo],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_lo],      %[dest_lo],      %[src]         \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],      %[mask4]       \n\t"
      "psubh      %[dest_lo],      %[dest_lo],      %[mask5]       \n\t"
      "psrah      %[dest_lo],      %[dest_lo],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],      %[mask3]       \n\t"
      "pshufh     %[src],          %[src_lo],       %[mask1]       \n\t"
      "psllh      %[dest_hi],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_hi],      %[dest_hi],      %[src]         \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],      %[mask4]       \n\t"
      "psubh      %[dest_hi],      %[dest_hi],      %[mask5]       \n\t"
      "psrah      %[dest_hi],      %[dest_hi],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],      %[mask3]       \n\t"
      "packushb   %[dest],         %[dest_lo],      %[dest_hi]     \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      // Pixels 2-3.
      "pshufh     %[src],          %[src_lo],       %[mask2]       \n\t"
      "psllh      %[dest_lo],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_lo],      %[dest_lo],      %[src]         \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],      %[mask4]       \n\t"
      "psubh      %[dest_lo],      %[dest_lo],      %[mask5]       \n\t"
      "psrah      %[dest_lo],      %[dest_lo],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],      %[mask3]       \n\t"
      "pshufh     %[src],          %[src_lo],       %[mask3]       \n\t"
      "psllh      %[dest_hi],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_hi],      %[dest_hi],      %[src]         \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],      %[mask4]       \n\t"
      "psubh      %[dest_hi],      %[dest_hi],      %[mask5]       \n\t"
      "psrah      %[dest_hi],      %[dest_hi],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],      %[mask3]       \n\t"
      "packushb   %[dest],         %[dest_lo],      %[dest_hi]     \n\t"
      "gssdlc1    %[dest],         0x0f(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x08(%[dst_ptr])                \n\t"

      // Pixels 4-5.
      "pshufh     %[src],          %[src_hi],       %[mask0]       \n\t"
      "psllh      %[dest_lo],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_lo],      %[dest_lo],      %[src]         \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],      %[mask4]       \n\t"
      "psubh      %[dest_lo],      %[dest_lo],      %[mask5]       \n\t"
      "psrah      %[dest_lo],      %[dest_lo],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],      %[mask3]       \n\t"
      "pshufh     %[src],          %[src_hi],       %[mask1]       \n\t"
      "psllh      %[dest_hi],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_hi],      %[dest_hi],      %[src]         \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],      %[mask4]       \n\t"
      "psubh      %[dest_hi],      %[dest_hi],      %[mask5]       \n\t"
      "psrah      %[dest_hi],      %[dest_hi],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],      %[mask3]       \n\t"
      "packushb   %[dest],         %[dest_lo],      %[dest_hi]     \n\t"
      "gssdlc1    %[dest],         0x17(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x10(%[dst_ptr])                \n\t"

      // Pixels 6-7.
      "pshufh     %[src],          %[src_hi],       %[mask2]       \n\t"
      "psllh      %[dest_lo],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_lo],      %[dest_lo],      %[src]         \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],      %[mask4]       \n\t"
      "psubh      %[dest_lo],      %[dest_lo],      %[mask5]       \n\t"
      "psrah      %[dest_lo],      %[dest_lo],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],      %[mask3]       \n\t"
      "pshufh     %[src],          %[src_hi],       %[mask3]       \n\t"
      "psllh      %[dest_hi],      %[src],          %[shift0]      \n\t"
      "paddush    %[dest_hi],      %[dest_hi],      %[src]         \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],      %[mask4]       \n\t"
      "psubh      %[dest_hi],      %[dest_hi],      %[mask5]       \n\t"
      "psrah      %[dest_hi],      %[dest_hi],      %[shift1]      \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],      %[mask3]       \n\t"
      "packushb   %[dest],         %[dest_lo],      %[dest_hi]     \n\t"
      "gssdlc1    %[dest],         0x1f(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x18(%[dst_ptr])                \n\t"

      "daddi      %[src_ptr],      %[src_ptr],      0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x20           \n\t"
      "daddi      %[width],        %[width],       -0x08           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
        [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
        [dest_lo] "=&f"(dest_lo)
      : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
        [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
        [shift1] "f"(shift1), [width] "r"(width)
      : "memory");
}

// Reverse a byte row.  Reads 8 bytes ending at src (which starts at the
// last byte), reverses them via pshufh selector 0x1b (halfword reverse on
// each half, then the packushb swaps halves), and writes forward.
// 8 bytes per iteration; assumes width is a multiple of 8.
void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  uint64_t source, src0, src1, dest;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x1b;  // Shuffle selector 00 01 10 11 -> reverse halfwords.

  src += width - 1;
  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[source],       0(%[src_ptr])                   \n\t"
      "gsldrc1    %[source],      -7(%[src_ptr])                   \n\t"
      "punpcklbh  %[src0],         %[source],       %[mask0]       \n\t"
      "pshufh     %[src0],         %[src0],         %[mask1]       \n\t"
      "punpckhbh  %[src1],         %[source],       %[mask0]       \n\t"
      "pshufh     %[src1],         %[src1],         %[mask1]       \n\t"
      "packushb   %[dest],         %[src1],         %[src0]        \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                \n\t"

      "daddi      %[src_ptr],      %[src_ptr],     -0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x08           \n\t"
      "daddi      %[width],        %[width],       -0x08           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
        [src1] "=&f"(src1)
      : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

// Reverse an interleaved UV row while splitting it into separate U and V
// planes.  Reads 16 bytes (8 UV pairs) backwards per iteration; even bytes
// (U) are masked out, odd bytes (V) shifted down, each reversed with the
// 0x1b shuffle.  Assumes width (in UV pairs) is a multiple of 8.
void MirrorUVRow_MMI(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t src0, src1, dest0, dest1;
  const uint64_t mask0 = 0x00ff00ff00ff00ffULL;  // U bytes of each pair.
  const uint64_t mask1 = 0x1b;                   // Halfword-reverse selector.
  const uint64_t shift = 0x08;

  src_uv += (width - 1) << 1;  // Point at the last UV pair.

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[src0],         1(%[src_ptr])                   \n\t"
      "gsldrc1    %[src0],        -6(%[src_ptr])                   \n\t"
      "gsldlc1    %[src1],        -7(%[src_ptr])                   \n\t"
      "gsldrc1    %[src1],       -14(%[src_ptr])                   \n\t"

      // U plane: keep even bytes, reverse, pack.
      "and        %[dest0],        %[src0],         %[mask0]       \n\t"
      "pshufh     %[dest0],        %[dest0],        %[mask1]       \n\t"
      "and        %[dest1],        %[src1],         %[mask0]       \n\t"
      "pshufh     %[dest1],        %[dest1],        %[mask1]       \n\t"
      "packushb   %[dest0],        %[dest0],        %[dest1]       \n\t"
      "gssdlc1    %[dest0],        0x07(%[dstu_ptr])               \n\t"
      "gssdrc1    %[dest0],        0x00(%[dstu_ptr])               \n\t"

      // V plane: shift odd bytes down, reverse, pack.
      "psrlh      %[dest0],        %[src0],         %[shift]       \n\t"
      "pshufh     %[dest0],        %[dest0],        %[mask1]       \n\t"
      "psrlh      %[dest1],        %[src1],         %[shift]       \n\t"
      "pshufh     %[dest1],        %[dest1],        %[mask1]       \n\t"
      "packushb   %[dest0],        %[dest0],        %[dest1]       \n\t"
      "gssdlc1    %[dest0],        0x07(%[dstv_ptr])               \n\t"
      "gssdrc1    %[dest0],        0x00(%[dstv_ptr])               \n\t"

      "daddi      %[src_ptr],      %[src_ptr],     -0x10           \n\t"
      "daddiu     %[dstu_ptr],     %[dstu_ptr],     0x08           \n\t"
      "daddiu     %[dstv_ptr],     %[dstv_ptr],     0x08           \n\t"
      "daddi      %[width],        %[width],       -0x08           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
        [src1] "=&f"(src1)
      : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
        [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [shift] "f"(shift)
      : "memory");
}

// Reverse a row of 32-bit ARGB pixels.  Loads 2 pixels ending at the last
// pixel and swaps them with pshufh selector 0x4e (swap 32-bit halves).
// 2 pixels per iteration; assumes width is a multiple of 2.
void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  src += (width - 1) * 4;  // Point at the last pixel.
  uint64_t temp = 0x0;
  uint64_t shuff = 0x4e;  // 01 00 11 10
  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[temp],         3(%[src])                       \n\t"
      "gsldrc1    %[temp],        -4(%[src])                       \n\t"
      "pshufh     %[temp],         %[temp],         %[shuff]       \n\t"
      "sdc1       %[temp],         0x0(%[dst])                     \n\t"

      "daddiu     %[src],          %[src],         -0x08           \n\t"
      "daddiu     %[dst],          %[dst],          0x08           \n\t"
      "daddiu     %[width],        %[width],       -0x02           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [temp] "=&f"(temp)
      : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
      : "memory");
}

// De-interleave a UV row into separate U and V planes.
// 16 input bytes -> 8 U + 8 V per iteration.  Uses aligned ldc1/sdc1 —
// assumes 8-byte aligned pointers (NOTE(review): confirm callers).
void SplitUVRow_MMI(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  uint64_t c0 = 0x00ff00ff00ff00ff;  // U bytes of each UV pair.
  uint64_t temp[4];
  uint64_t shift = 0x08;
  __asm__ volatile(
      "1:                                                          \n\t"
      "ldc1       %[t0],           0x00(%[src_uv])                 \n\t"
      "ldc1       %[t1],           0x08(%[src_uv])                 \n\t"

      // U: mask even bytes, pack both halves.
      "and        %[t2],           %[t0],           %[c0]          \n\t"
      "and        %[t3],           %[t1],           %[c0]          \n\t"
      "packushb   %[t2],           %[t2],           %[t3]          \n\t"
      "sdc1       %[t2],           0x0(%[dst_u])                   \n\t"

      // V: shift odd bytes down, pack both halves.
      "psrlh      %[t2],           %[t0],           %[shift]       \n\t"
      "psrlh      %[t3],           %[t1],           %[shift]       \n\t"
      "packushb   %[t2],           %[t2],           %[t3]          \n\t"
      "sdc1       %[t2],           0x0(%[dst_v])                   \n\t"

      "daddiu     %[src_uv],       %[src_uv],       16             \n\t"
      "daddiu     %[dst_u],        %[dst_u],        8              \n\t"
      "daddiu     %[dst_v],        %[dst_v],        8              \n\t"
      "daddiu     %[width],        %[width],       -8              \n\t"
      "bgtz       %[width],        1b                              \n\t"
      "nop                                                         \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
        [t3] "=&f"(temp[3])
      : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
        [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
      : "memory");
}

// Interleave separate U and V planes into a UV row.
// 8 U + 8 V bytes -> 16 interleaved bytes per iteration.  Uses aligned
// ldc1/sdc1 — assumes 8-byte aligned pointers (NOTE(review): confirm).
void MergeUVRow_MMI(const uint8_t* src_u,
                    const uint8_t* src_v,
                    uint8_t* dst_uv,
                    int width) {
  uint64_t temp[3];
  __asm__ volatile(
      "1:                                                          \n\t"
      "ldc1       %[t0],           0x0(%[src_u])                   \n\t"
      "ldc1       %[t1],           0x0(%[src_v])                   \n\t"
      "punpcklbh  %[t2],           %[t0],           %[t1]          \n\t"
      "sdc1       %[t2],           0x0(%[dst_uv])                  \n\t"
      "punpckhbh  %[t2],           %[t0],           %[t1]          \n\t"
      "sdc1       %[t2],           0x8(%[dst_uv])                  \n\t"

      "daddiu     %[src_u],        %[src_u],        8              \n\t"
      "daddiu     %[src_v],        %[src_v],        8              \n\t"
      "daddiu     %[dst_uv],       %[dst_uv],       16             \n\t"
      "daddiu     %[width],        %[width],       -8              \n\t"
      "bgtz       %[width],        1b                              \n\t"
      "nop                                                         \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
      : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
        [width] "r"(width)
      : "memory");
}

// De-interleave 4 packed 3-byte RGB pixels (12 bytes) into 4 bytes each of
// the R, G and B planes per iteration.  The byte-interleave/halfword-
// interleave sequence transposes the 4x3 layout; assumes width is a
// multiple of 4.
void SplitRGBRow_MMI(const uint8_t* src_rgb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
                     uint8_t* dst_b,
                     int width) {
  uint64_t src[4];
  uint64_t dest_hi, dest_lo, dest;

  __asm__ volatile(
      "1:                                                          \n\t"
      // Load the 4 pixels as four overlapping 4-byte words.
      "gslwlc1    %[src0],         0x03(%[src_ptr])                \n\t"
      "gslwrc1    %[src0],         0x00(%[src_ptr])                \n\t"
      "gslwlc1    %[src1],         0x06(%[src_ptr])                \n\t"
      "gslwrc1    %[src1],         0x03(%[src_ptr])                \n\t"
      "punpcklbh  %[dest_lo],      %[src0],         %[src1]        \n\t"
      "gslwlc1    %[src2],         0x09(%[src_ptr])                \n\t"
      "gslwrc1    %[src2],         0x06(%[src_ptr])                \n\t"
      "gslwlc1    %[src3],         0x0c(%[src_ptr])                \n\t"
      "gslwrc1    %[src3],         0x09(%[src_ptr])                \n\t"
      "punpcklbh  %[dest_hi],      %[src2],         %[src3]        \n\t"

      // First transposed row -> R plane; shift for G; second row -> B.
      "punpcklhw  %[dest],         %[dest_lo],      %[dest_hi]     \n\t"
      "gsswlc1    %[dest],         0x03(%[dstr_ptr])               \n\t"
      "gsswrc1    %[dest],         0x00(%[dstr_ptr])               \n\t"
      "punpckhwd  %[dest],         %[dest],         %[dest]        \n\t"
      "gsswlc1    %[dest],         0x03(%[dstg_ptr])               \n\t"
      "gsswrc1    %[dest],         0x00(%[dstg_ptr])               \n\t"
      "punpckhhw  %[dest],         %[dest_lo],      %[dest_hi]     \n\t"
      "gsswlc1    %[dest],         0x03(%[dstb_ptr])               \n\t"
      "gsswrc1    %[dest],         0x00(%[dstb_ptr])               \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],      0x0c           \n\t"
      "daddiu     %[dstr_ptr],     %[dstr_ptr],     0x04           \n\t"
      "daddiu     %[dstg_ptr],     %[dstg_ptr],     0x04           \n\t"
      "daddiu     %[dstb_ptr],     %[dstb_ptr],     0x04           \n\t"
      "daddi      %[width],        %[width],       -0x04           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
        [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
        [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
      : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
        [dstb_ptr] "r"(dst_b), [width] "r"(width)
      : "memory");
}

// Interleave separate R, G, B planes into packed 3-byte RGB pixels.
// 8 bytes of each plane -> 24 output bytes per iteration, written as eight
// overlapping 4-byte stores 3 bytes apart (the 4th lane of each RGBZ quad
// is overwritten by the next store).  Assumes width is a multiple of 8.
void MergeRGBRow_MMI(const uint8_t* src_r,
                     const uint8_t* src_g,
                     const uint8_t* src_b,
                     uint8_t* dst_rgb,
                     int width) {
  uint64_t srcr, srcg, srcb, dest;
  uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
  const uint64_t temp = 0x0;

  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldlc1    %[srcr],         0x07(%[srcr_ptr])               \n\t"
      "gsldrc1    %[srcr],         0x00(%[srcr_ptr])               \n\t"
      "gsldlc1    %[srcg],         0x07(%[srcg_ptr])               \n\t"
      "gsldrc1    %[srcg],         0x00(%[srcg_ptr])               \n\t"
      "punpcklbh  %[srcrg_lo],     %[srcr],         %[srcg]        \n\t"
      "punpckhbh  %[srcrg_hi],     %[srcr],         %[srcg]        \n\t"

      "gsldlc1    %[srcb],         0x07(%[srcb_ptr])               \n\t"
      "gsldrc1    %[srcb],         0x00(%[srcb_ptr])               \n\t"
      "punpcklbh  %[srcbz_lo],     %[srcb],         %[temp]        \n\t"
      "punpckhbh  %[srcbz_hi],     %[srcb],         %[temp]        \n\t"

      // Emit pixels 0..7, one RGBZ quad at a time, overlapping by 1 byte.
      "punpcklhw  %[dest],         %[srcrg_lo],     %[srcbz_lo]    \n\t"
      "gsswlc1    %[dest],         0x03(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x00(%[dst_ptr])                \n\t"
      "punpckhwd  %[dest],         %[dest],         %[dest]        \n\t"
      "gsswlc1    %[dest],         0x06(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x03(%[dst_ptr])                \n\t"
      "punpckhhw  %[dest],         %[srcrg_lo],     %[srcbz_lo]    \n\t"
      "gsswlc1    %[dest],         0x09(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x06(%[dst_ptr])                \n\t"
      "punpckhwd  %[dest],         %[dest],         %[dest]        \n\t"
      "gsswlc1    %[dest],         0x0c(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x09(%[dst_ptr])                \n\t"
      "punpcklhw  %[dest],         %[srcrg_hi],     %[srcbz_hi]    \n\t"
      "gsswlc1    %[dest],         0x0f(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x0c(%[dst_ptr])                \n\t"
      "punpckhwd  %[dest],         %[dest],         %[dest]        \n\t"
      "gsswlc1    %[dest],         0x12(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x0f(%[dst_ptr])                \n\t"
      "punpckhhw  %[dest],         %[srcrg_hi],     %[srcbz_hi]    \n\t"
      "gsswlc1    %[dest],         0x15(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x12(%[dst_ptr])                \n\t"
      "punpckhwd  %[dest],         %[dest],         %[dest]        \n\t"
      "gsswlc1    %[dest],         0x18(%[dst_ptr])                \n\t"
      "gsswrc1    %[dest],         0x15(%[dst_ptr])                \n\t"

      "daddiu     %[srcr_ptr],     %[srcr_ptr],     0x08           \n\t"
      "daddiu     %[srcg_ptr],     %[srcg_ptr],     0x08           \n\t"
      "daddiu     %[srcb_ptr],     %[srcb_ptr],     0x08           \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],      0x18           \n\t"
      "daddi      %[width],        %[width],       -0x08           \n\t"
      "bnez       %[width],        1b                              \n\t"
      : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
        [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
        [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
        [srcbz_lo] "=&f"(srcbz_lo)
      : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
        [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
      : "memory");
}

// Filter 2 rows of YUY2 UV's (422) into U and V (420).
// Average the UV samples of two adjacent YUY2 rows (Y0 U0 Y1 V0 ...) and
// split them into separate U and V rows (422 -> 420 chroma subsample).
// Processes 16 pixels (32 bytes per row) per iteration, emitting 8 U and
// 8 V bytes; assumes width is a positive multiple of 16 -- TODO confirm.
void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t c0 = 0xff00ff00ff00ff00;  // keeps the chroma (odd) bytes
  uint64_t c1 = 0x00ff00ff00ff00ff;  // keeps the U bytes after packing
  uint64_t temp[3];
  uint64_t data[4];
  uint64_t shift = 0x08;
  uint64_t src_stride = 0x0;
  __asm__ volatile(
      "1: \n\t"
      // Load 8 bytes from this row and the next, average vertically.
      "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t"
      "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
      "ldc1 %[t1], 0x00(%[src_stride]) \n\t"
      "pavgb %[t0], %[t0], %[t1] \n\t"

      "ldc1 %[t2], 0x08(%[src_yuy2]) \n\t"
      "ldc1 %[t1], 0x08(%[src_stride]) \n\t"
      "pavgb %[t1], %[t2], %[t1] \n\t"

      // Isolate chroma bytes, pack to U,V,U,V..., then split into U (d0)
      // and V (d1) lanes.
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "psrlh %[t0], %[t0], %[shift] \n\t"
      "psrlh %[t1], %[t1], %[shift] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d0], %[t0], %[c1] \n\t"
      "psrlh %[d1], %[t1], %[shift] \n\t"

      // Same again for the next 16 bytes of each row.
      "ldc1 %[t0], 0x10(%[src_yuy2]) \n\t"
      "ldc1 %[t1], 0x10(%[src_stride]) \n\t"
      "pavgb %[t0], %[t0], %[t1] \n\t"

      "ldc1 %[t2], 0x18(%[src_yuy2]) \n\t"
      "ldc1 %[t1], 0x18(%[src_stride]) \n\t"
      "pavgb %[t1], %[t2], %[t1] \n\t"

      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "psrlh %[t0], %[t0], %[shift] \n\t"
      "psrlh %[t1], %[t1], %[shift] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d2], %[t0], %[c1] \n\t"
      "psrlh %[d3], %[t1], %[shift] \n\t"

      // Pack the two half-rows and store 8 U and 8 V bytes.
      "packushb %[d0], %[d0], %[d2] \n\t"
      "packushb %[d1], %[d1], %[d3] \n\t"
      "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
      "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
      "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
      "daddiu %[dst_u], %[dst_u], 8 \n\t"
      "daddiu %[dst_v], %[dst_v], 8 \n\t"
      "daddiu %[width], %[width], -16 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
        [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
        [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
      : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
      : "memory");
}

// Copy row of YUY2 UV's (422) into U and V (422).
// Same chroma extraction as YUY2ToUVRow_MMI but without the two-row
// average; 16 pixels per iteration.
void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  uint64_t c0 = 0xff00ff00ff00ff00;  // keeps the chroma (odd) bytes
  uint64_t c1 = 0x00ff00ff00ff00ff;  // keeps the U bytes after packing
  uint64_t temp[2];
  uint64_t data[4];
  uint64_t shift = 0x08;
  __asm__ volatile(
      "1: \n\t"
      "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t"
      "ldc1 %[t1], 0x08(%[src_yuy2]) \n\t"
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "psrlh %[t0], %[t0], %[shift] \n\t"
      "psrlh %[t1], %[t1], %[shift] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d0], %[t0], %[c1] \n\t"  // U bytes
      "psrlh %[d1], %[t1], %[shift] \n\t"  // V bytes

      "ldc1 %[t0], 0x10(%[src_yuy2]) \n\t"
      "ldc1 %[t1], 0x18(%[src_yuy2]) \n\t"
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "psrlh %[t0], %[t0], %[shift] \n\t"
      "psrlh %[t1], %[t1], %[shift] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d2], %[t0], %[c1] \n\t"
      "psrlh %[d3], %[t1], %[shift] \n\t"

      "packushb %[d0], %[d0], %[d2] \n\t"
      "packushb %[d1], %[d1], %[d3] \n\t"
      "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
      "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
      "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
      "daddiu %[dst_u], %[dst_u], 8 \n\t"
      "daddiu %[dst_v], %[dst_v], 8 \n\t"
      "daddiu %[width], %[width], -16 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
        [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
      : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
        [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
      : "memory");
}

// Copy row of YUY2 Y's (422) into Y (420/422).
// Extract the luma (even) bytes of a YUY2 row; 8 Y bytes out per iteration.
void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  // Output a row of Y values.
  uint64_t c0 = 0x00ff00ff00ff00ff;  // keeps the Y (even) bytes
  uint64_t temp[2];
  __asm__ volatile(
      "1: \n\t"
      "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t"
      "ldc1 %[t1], 0x08(%[src_yuy2]) \n\t"
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"  // pack 8 Y halfwords to bytes
      "sdc1 %[t0], 0x0(%[dst_y]) \n\t"
      "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
      "daddiu %[dst_y], %[dst_y], 8 \n\t"
      "daddiu %[width], %[width], -8 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
      : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
        [c0] "f"(c0)
      : "memory");
}

// Filter 2 rows of UYVY UV's (422) into U and V (420).
// UYVY byte order is U0 Y0 V0 Y1: the chroma bytes sit at even offsets, so
// the even-byte mask selects them directly (no pre-shift as in YUY2).
// 16 pixels per iteration; assumes width is a positive multiple of 16 --
// TODO confirm with callers.
void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
                     int src_stride_uyvy,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  // Output a row of UV values.
  uint64_t c0 = 0x00ff00ff00ff00ff;  // keeps the chroma (even) bytes
  uint64_t temp[3];
  uint64_t data[4];
  uint64_t shift = 0x08;
  uint64_t src_stride = 0x0;
  __asm__ volatile(
      "1: \n\t"
      // Load and vertically average two rows.
      "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t"
      "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
      "ldc1 %[t1], 0x00(%[src_stride]) \n\t"
      "pavgb %[t0], %[t0], %[t1] \n\t"

      "ldc1 %[t2], 0x08(%[src_uyvy]) \n\t"
      "ldc1 %[t1], 0x08(%[src_stride]) \n\t"
      "pavgb %[t1], %[t2], %[t1] \n\t"

      // Isolate chroma, pack to U,V,U,V..., split into U (d0) / V (d1).
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d0], %[t0], %[c0] \n\t"
      "psrlh %[d1], %[t1], %[shift] \n\t"

      // Next 16 bytes of each row.
      "ldc1 %[t0], 0x10(%[src_uyvy]) \n\t"
      "ldc1 %[t1], 0x10(%[src_stride]) \n\t"
      "pavgb %[t0], %[t0], %[t1] \n\t"

      "ldc1 %[t2], 0x18(%[src_uyvy]) \n\t"
      "ldc1 %[t1], 0x18(%[src_stride]) \n\t"
      "pavgb %[t1], %[t2], %[t1] \n\t"

      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d2], %[t0], %[c0] \n\t"
      "psrlh %[d3], %[t1], %[shift] \n\t"

      "packushb %[d0], %[d0], %[d2] \n\t"
      "packushb %[d1], %[d1], %[d3] \n\t"
      "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
      "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
      "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
      "daddiu %[dst_u], %[dst_u], 8 \n\t"
      "daddiu %[dst_v], %[dst_v], 8 \n\t"
      "daddiu %[width], %[width], -16 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
        [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
        [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
      : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [c0] "f"(c0), [shift] "f"(shift)
      : "memory");
}

// Copy row of UYVY UV's (422) into U and V (422).
// Single-row variant of UYVYToUVRow_MMI (no vertical average).
void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  // Output a row of UV values.
  uint64_t c0 = 0x00ff00ff00ff00ff;
  uint64_t temp[2];
  uint64_t data[4];
  uint64_t shift = 0x08;
  __asm__ volatile(
      "1: \n\t"
      "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t"
      "ldc1 %[t1], 0x08(%[src_uyvy]) \n\t"
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d0], %[t0], %[c0] \n\t"
      "psrlh %[d1], %[t1], %[shift] \n\t"

      "ldc1 %[t0], 0x10(%[src_uyvy]) \n\t"
      "ldc1 %[t1], 0x18(%[src_uyvy]) \n\t"
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "mov.s %[t1], %[t0] \n\t"
      "and %[d2], %[t0], %[c0] \n\t"
      "psrlh %[d3], %[t1], %[shift] \n\t"

      "packushb %[d0], %[d0], %[d2] \n\t"
      "packushb %[d1], %[d1], %[d3] \n\t"
      "sdc1 %[d0], 0x0(%[dst_u]) \n\t"
      "sdc1 %[d1], 0x0(%[dst_v]) \n\t"
      "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
      "daddiu %[dst_u], %[dst_u], 8 \n\t"
      "daddiu %[dst_v], %[dst_v], 8 \n\t"
      "daddiu %[width], %[width], -16 \n\t"
      "bgtz %[width], 1b
\n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), + [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) + : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), + [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) + : "memory"); +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + // Output a row of Y values. + uint64_t c0 = 0x00ff00ff00ff00ff; + uint64_t shift = 0x08; + uint64_t temp[2]; + __asm__ volatile( + "1: \n\t" + "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "ldc1 %[t1], 0x08(%[src_uyvy]) \n\t" + "dsrl %[t0], %[t0], %[shift] \n\t" + "dsrl %[t1], %[t1], %[shift] \n\t" + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "sdc1 %[t0], 0x0(%[dst_y]) \n\t" + "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" + "daddiu %[dst_y], %[dst_y], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) + : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), + [c0] "f"(c0), [shift] "f"(shift) + : "memory"); +} + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. 
// Alpha-blend src_argb0 over src_argb1: dst = src0 + src1 * (255 - a0) / 256,
// per channel, with the destination alpha forced opaque via mask4.
// Processes 2 ARGB pixels (8 bytes) per iteration; assumes width is a
// positive multiple of 2 -- TODO confirm with callers.
void ARGBBlendRow_MMI(const uint8_t* src_argb0,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width) {
  uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
      dest_lo;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;  // keeps B,G,R of each pixel
  const uint64_t mask2 = 0x00FF00FF00FF00FFULL;  // 255 per 16-bit lane
  const uint64_t mask3 = 0xFF;  // pshufh control: broadcast lane 3 (alpha)
  const uint64_t mask4 = ~mask1;  // alpha byte positions
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
      "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
      "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"  // widen to 16-bit lanes

      "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
      "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
      "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"

      // alpha = broadcast(255 - a0); dest = src0 + (src1 * alpha >> 8).
      "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
      "pshufh %[alpha], %[alpha], %[mask3] \n\t"
      "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
      "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
      "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"

      "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
      "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"

      "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
      "pshufh %[alpha], %[alpha], %[mask3] \n\t"
      "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
      "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
      "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"

      // Pack both pixels and force alpha to 0xFF.
      "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
      "and %[dest], %[dest], %[mask1] \n\t"
      "or %[dest], %[dest], %[mask4] \n\t"
      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
      "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
        [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
        [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
        [shift] "f"(shift), [width] "r"(width)
      : "memory");
}

// Blend two planes with a per-pixel alpha plane:
// dst = (src0 * a + src1 * (255 - a) + 255) >> 8 (saturating adds).
// Processes 8 pixels per iteration; assumes width is a positive multiple
// of 8 -- TODO confirm with callers.
void BlendPlaneRow_MMI(const uint8_t* src0,
                       const uint8_t* src1,
                       const uint8_t* alpha,
                       uint8_t* dst,
                       int width) {
  uint64_t source0, source1, dest, alph;
  uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
      dest_lo;
  uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;  // 255 per byte: 255 - a
  const uint64_t mask2 = 0x00FF00FF00FF00FFULL;  // +255 rounding per lane
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
      "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
      "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
      "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"

      "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
      "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
      "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
      "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"

      // Load alpha, compute its byte-wise complement, widen both.
      "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
      "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
      "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
      "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
      "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
      "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
      "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"

      "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
      "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
      "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
      "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"  // round up
      "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"

      "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
      "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
      "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
      "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
      "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"

      "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
      "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
      "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x08 \n\t"
      "bnez %[width], 1b \n\t"
      : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
        [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
        [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
        [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
        [alpha_r] "=&f"(alpha_rev)
      : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
        [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
      : "memory");
}

// Multiply source RGB by alpha and store to destination.
// This code mimics the SSSE3 version for better testability.
// Attenuate each B/G/R channel by the pixel's own alpha; alpha itself is
// copied through unchanged (mask1 keeps the source alpha bytes).
// Processes 2 ARGB pixels per iteration; assumes width is a positive
// multiple of 2 -- TODO confirm with callers.
void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width) {
  uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
  const uint64_t mask0 = 0xFF;  // pshufh control: broadcast lane 3 (alpha)
  const uint64_t mask1 = 0xFF000000FF000000ULL;  // alpha byte positions
  const uint64_t mask2 = ~mask1;
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
      "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
      // Duplicate each byte into a 16-bit lane (x -> x*257).
      "punpcklbh %[src_lo], %[src], %[src] \n\t"
      "punpckhbh %[src_hi], %[src], %[src] \n\t"

      // Broadcast the alpha lane and take the high product, then >> 8.
      "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
      "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
      "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
      "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
      "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
      "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"

      // Recombine, restoring the original alpha bytes.
      "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
      "and %[dest], %[dest], %[mask2] \n\t"
      "and %[src], %[src], %[mask1] \n\t"
      "or %[dest], %[dest], %[src] \n\t"
      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
        [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
        [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
        [width] "r"(width)
      : "memory");
}

// Running cumulative sum for one row: for each 4-byte pixel, add the four
// channel values to per-channel running sums and add the corresponding
// 4 int32 entries of previous_cumsum, writing 4 int32 per pixel to cumsum.
// One pixel per iteration; width counts pixels and is assumed positive --
// TODO confirm with callers.
void ComputeCumulativeSumRow_MMI(const uint8_t* row,
                                 int32_t* cumsum,
                                 const int32_t* previous_cumsum,
                                 int width) {
  int64_t row_sum[2] = {0, 0};  // running sums, 2 packed int32 per register
  uint64_t src, dest0, dest1, presrc0, presrc1, dest;
  const uint64_t mask = 0x0;

  __asm__ volatile(
      // Clear both running-sum registers (also zeroed in C above).
      "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
      "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"

      "1: \n\t"
      "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
      "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"

      // Widen 4 bytes to 4 x int32 across two registers.
      "punpcklbh %[src], %[src], %[mask] \n\t"
      "punpcklhw %[dest0], %[src], %[mask] \n\t"
      "punpckhhw %[dest1], %[src], %[mask] \n\t"

      "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
      "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"

      // Add previous row's cumulative sums (4 x int32).
      "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
      "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
      "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
      "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"

      "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
      "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"

      "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
      "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"

      "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
      "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
      "daddi %[width], %[width], -0x01 \n\t"
      "bnez %[width], 1b \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
        [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
        [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
        [presrc1] "=&f"(presrc1)
      : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
        [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
      : "memory");
}

// C version 2x2 -> 2x1.
// Vertically interpolate two rows: dst = src * (256 - f) + src[stride] * f,
// rounded, with fast paths for f == 0 (copy) and f == 128 (average).
// Processes 8 bytes per iteration; assumes width is a positive multiple
// of 8 -- TODO confirm with callers.
// NOTE(review): the f==0 and f==128 paths use GPR $t0 without listing it in
// the clobber list, and several input-only operands (pointers, width, fy0,
// fy1) are modified inside the asm -- technically these need "+r"/"+f"
// constraints and a "$t0" clobber.
void InterpolateRow_MMI(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int source_y_fraction) {
  if (source_y_fraction == 0) {
    // Fraction 0: plain 8-byte copy of the top row.
    __asm__ volatile(
        "1: \n\t"
        "ld $t0, 0x0(%[src_ptr]) \n\t"
        "sd $t0, 0x0(%[dst_ptr]) \n\t"
        "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
        "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
        "daddiu %[width], %[width], -8 \n\t"
        "bgtz %[width], 1b \n\t"
        "nop \n\t"
        :
        : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
          [width] "r"(width)
        : "memory");
    return;
  }
  if (source_y_fraction == 128) {
    // Fraction 1/2: byte-wise average of the two rows.
    uint64_t uv = 0x0;
    uint64_t uv_stride = 0x0;
    __asm__ volatile(
        "1: \n\t"
        "ldc1 %[uv], 0x0(%[src_ptr]) \n\t"
        "daddu $t0, %[src_ptr], %[stride] \n\t"
        "ldc1 %[uv_stride], 0x0($t0) \n\t"

        "pavgb %[uv], %[uv], %[uv_stride] \n\t"
        "sdc1 %[uv], 0x0(%[dst_ptr]) \n\t"

        "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
        "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
        "daddiu %[width], %[width], -8 \n\t"
        "bgtz %[width], 1b \n\t"
        "nop \n\t"
        : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
        : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
          [stride] "r"((int64_t)src_stride)
        : "memory");
    return;
  }
  // General case: dst = (src * (256 - f) + src1 * f + 128) >> 8.
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  uint64_t temp;
  uint64_t data[4];
  uint64_t zero = 0x0;
  uint64_t c0 = 0x0080008000800080;  // +128 rounding per 16-bit lane
  uint64_t fy0 = 0x0100010001000100;  // 256 per lane; becomes 256 - f
  uint64_t shift = 0x8;
  __asm__ volatile(
      // Broadcast f to all lanes, compute the complementary weight.
      "pshufh %[fy1], %[fy1], %[zero] \n\t"
      "psubh %[fy0], %[fy0], %[fy1] \n\t"
      "1: \n\t"
      "ldc1 %[t0], 0x0(%[src_ptr]) \n\t"
      "punpcklbh %[d0], %[t0], %[zero] \n\t"
      "punpckhbh %[d1], %[t0], %[zero] \n\t"
      "ldc1 %[t0], 0x0(%[src_ptr1]) \n\t"
      "punpcklbh %[d2], %[t0], %[zero] \n\t"
      "punpckhbh %[d3], %[t0], %[zero] \n\t"

      "pmullh %[d0], %[d0], %[fy0] \n\t"
      "pmullh %[d2], %[d2], %[fy1] \n\t"
      "paddh %[d0], %[d0], %[d2] \n\t"
      "paddh %[d0], %[d0], %[c0] \n\t"
      "psrlh %[d0], %[d0], %[shift] \n\t"

      "pmullh %[d1], %[d1], %[fy0] \n\t"
      "pmullh %[d3], %[d3], %[fy1] \n\t"
      "paddh %[d1], %[d1], %[d3] \n\t"
      "paddh %[d1], %[d1], %[c0] \n\t"
      "psrlh %[d1], %[d1], %[shift] \n\t"

      "packushb %[d0], %[d0], %[d1] \n\t"
      "sdc1 %[d0], 0x0(%[dst_ptr]) \n\t"
      "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
      "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
      "daddiu %[width], %[width], -8 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"
      : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
        [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
      : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
        [dst_ptr] "r"(dst_ptr), [width] "r"(width),
        [fy1] "f"(source_y_fraction), [fy0] "f"(fy0),
        [c0] "f"(c0), [shift] "f"(shift), [zero] "f"(zero)
      : "memory");
}

// Use first 4 shuffler values to reorder ARGB channels.
// The byte indices are reduced mod 4 into a pshufh control word, so the
// same 4-byte shuffle is applied to every pixel; 2 pixels per iteration.
void ARGBShuffleRow_MMI(const uint8_t* src_argb,
                        uint8_t* dst_argb,
                        const uint8_t* shuffler,
                        int width) {
  uint64_t source, dest0, dest1, dest;
  const uint64_t mask0 = 0x0;
  // Build the 2-bit-per-lane pshufh control from shuffler[0..3].
  const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
                         ((shuffler[2] & 0x03) << 4) |
                         ((shuffler[3] & 0x03) << 6);

  __asm__ volatile(
      "1: \n\t"
      "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
      "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"

      // Widen each pixel to 16-bit lanes, shuffle lanes, repack.
      "punpcklbh %[dest0], %[src], %[mask0] \n\t"
      "pshufh %[dest0], %[dest0], %[mask1] \n\t"
      "punpckhbh %[dest1], %[src], %[mask0] \n\t"
      "pshufh %[dest1], %[dest1], %[mask1] \n\t"
      "packushb %[dest], %[dest0], %[dest1] \n\t"

      "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
      "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"

      "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
      "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
      "daddi %[width], %[width], -0x02 \n\t"
      "bnez %[width], 1b \n\t"
      : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
        [dest1] "=&f"(dest1)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

// Pack planar I422 Y/U/V into interleaved YUY2 (Y0 U0 Y1 V0 ...).
void I422ToYUY2Row_MMI(const uint8_t* src_y,
                       const uint8_t*
src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + uint64_t temp[3]; + uint64_t vu = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] + "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] + "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] + "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] + "punpcklbh %[vu], %[tu], %[tv] \n\t" // g + "punpcklbh %[tu], %[ty], %[vu] \n\t" // g + "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" + "punpckhbh %[tu], %[ty], %[vu] \n\t" // g + "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" + "daddiu %[src_y], %[src_y], 8 \n\t" + "daddiu %[src_u], %[src_u], 4 \n\t" + "daddiu %[src_v], %[src_v], 4 \n\t" + "daddiu %[dst_frame], %[dst_frame], 16 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), + [vu] "=&f"(vu) + : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), + [dst_frame] "r"(dst_frame), [width] "r"(width) + : "memory"); +} + +void I422ToUYVYRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + uint64_t temp[3]; + uint64_t vu = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] + "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] + "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] + "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] + "punpcklbh %[vu], %[tu], %[tv] \n\t" // g + "punpcklbh %[tu], %[vu], %[ty] \n\t" // g + "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" + "punpckhbh %[tu], %[vu], %[ty] \n\t" 
// g + "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" + "daddiu %[src_y], %[src_y], 8 \n\t" + "daddiu %[src_u], %[src_u], 4 \n\t" + "daddiu %[src_v], %[src_v], 4 \n\t" + "daddiu %[dst_frame], %[dst_frame], 16 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), + [vu] "=&f"(vu) + : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), + [dst_frame] "r"(dst_frame), [width] "r"(width) + : "memory"); +} + +void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { + uint64_t source, dest; + const uint64_t mask0 = 0xff000000ff000000ULL; + const uint64_t mask1 = ~mask0; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "and %[src], %[src], %[mask0] \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[src], %[dest] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(source), [dest] "=&f"(dest) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), + [mask1] "f"(mask1), [width] "r"(width) + : "memory"); +} + +void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; + const uint64_t mask = 0xff000000ff000000ULL; + const uint64_t shift = 0x18; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "and %[dest0], %[src], %[mask] \n\t" + "psrlw %[dest0], %[dest0], %[shift] \n\t" + "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" + "and %[dest1], %[src], 
%[mask] \n\t" + "psrlw %[dest1], %[dest1], %[shift] \n\t" + "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" + + "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" + "and %[dest0], %[src], %[mask] \n\t" + "psrlw %[dest0], %[dest0], %[shift] \n\t" + "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" + "and %[dest1], %[src], %[mask] \n\t" + "psrlw %[dest1], %[dest1], %[shift] \n\t" + "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), + [shift] "f"(shift), [width] "r"(width) + : "memory"); +} + +void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { + uint64_t source, dest0, dest1, dest; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x00ffffff00ffffffULL; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[dest0], %[mask0], %[src] \n\t" + "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 
0x08(%[dst_ptr]) \n\t" + + "punpckhbh %[dest0], %[mask0], %[src] \n\t" + "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" + "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), + [mask1] "f"(mask1), [width] "r"(width) + : "memory"); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale.cc b/source/scale.cc index 2cfa1c6c..6130744d 100644..100755 --- a/source/scale.cc +++ b/source/scale.cc @@ -118,6 +118,21 @@ static void ScalePlaneDown2(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MMI + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI + : ScaleRowDown2Box_Any_MMI); + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI + : (filtering == kFilterLinear + ? 
ScaleRowDown2Linear_MMI + : ScaleRowDown2Box_MMI); + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -169,6 +184,15 @@ static void ScalePlaneDown2_16(int src_width, : ScaleRowDown2Box_16_SSE2); } #endif +#if defined(HAS_SCALEROWDOWN2_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_MMI + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_MMI + : ScaleRowDown2Box_16_MMI); + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -241,6 +265,15 @@ static void ScalePlaneDown4(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN4_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI; + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -284,6 +317,12 @@ static void ScalePlaneDown4_16(int src_width, filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif +#if defined(HAS_SCALEROWDOWN4_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = + filtering ? 
ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI; + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -849,6 +888,14 @@ static void ScalePlaneBox(int src_width, } } #endif +#if defined(HAS_SCALEADDROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleAddRow = ScaleAddRow_Any_MMI; + if (IS_ALIGNED(src_width, 8)) { + ScaleAddRow = ScaleAddRow_MMI; + } + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; @@ -904,6 +951,11 @@ static void ScalePlaneBox_16(int src_width, } #endif +#if defined(HAS_SCALEADDROW_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) { + ScaleAddRow = ScaleAddRow_16_MMI; + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; @@ -988,6 +1040,14 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1207,6 +1267,11 @@ void ScalePlaneBilinearUp(int src_width, ScaleFilterCols = ScaleColsUp2_SSE2; } #endif +#if defined(HAS_SCALECOLS_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_MMI; + } +#endif } if (y > max_y) { @@ -1334,6 +1399,11 @@ void ScalePlaneBilinearUp_16(int src_width, ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif +#if defined(HAS_SCALECOLS_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_16_MMI; + } +#endif } if (y > max_y) { @@ -1419,6 +1489,11 @@ static void ScalePlaneSimple(int src_width, ScaleCols = ScaleColsUp2_SSE2; } #endif +#if defined(HAS_SCALECOLS_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_MMI; + } +#endif } for (i = 0; i < dst_height; ++i) { @@ -1455,6 +1530,11 @@ static void ScalePlaneSimple_16(int src_width, ScaleCols = 
ScaleColsUp2_16_SSE2; } #endif +#if defined(HAS_SCALECOLS_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_16_MMI; + } +#endif } for (i = 0; i < dst_height; ++i) { diff --git a/source/scale_any.cc b/source/scale_any.cc index 53ad1364..8714c369 100644..100755 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -42,6 +42,9 @@ CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) #ifdef HAS_SCALEARGBCOLS_MSA CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) #endif +#ifdef HAS_SCALEARGBCOLS_MMI +CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) +#endif #ifdef HAS_SCALEARGBFILTERCOLS_NEON CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, @@ -165,6 +168,27 @@ SDANY(ScaleRowDown2Box_Any_MSA, 1, 31) #endif +#ifdef HAS_SCALEROWDOWN2_MMI +SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7) +SDANY(ScaleRowDown2Linear_Any_MMI, + ScaleRowDown2Linear_MMI, + ScaleRowDown2Linear_C, + 2, + 1, + 7) +SDANY(ScaleRowDown2Box_Any_MMI, + ScaleRowDown2Box_MMI, + ScaleRowDown2Box_C, + 2, + 1, + 7) +SDODD(ScaleRowDown2Box_Odd_MMI, + ScaleRowDown2Box_MMI, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 7) +#endif #ifdef HAS_SCALEROWDOWN4_SSSE3 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4Box_Any_SSSE3, @@ -201,6 +225,15 @@ SDANY(ScaleRowDown4Box_Any_MSA, 1, 15) #endif +#ifdef HAS_SCALEROWDOWN4_MMI +SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_MMI, + ScaleRowDown4Box_MMI, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif #ifdef HAS_SCALEROWDOWN34_SSSE3 SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, @@ -382,6 +415,26 @@ SDANY(ScaleARGBRowDown2Box_Any_MSA, 4, 3) #endif +#ifdef HAS_SCALEARGBROWDOWN2_MMI +SDANY(ScaleARGBRowDown2_Any_MMI, + ScaleARGBRowDown2_MMI, + ScaleARGBRowDown2_C, + 2, + 4, + 1) +SDANY(ScaleARGBRowDown2Linear_Any_MMI, + 
ScaleARGBRowDown2Linear_MMI, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 1) +SDANY(ScaleARGBRowDown2Box_Any_MMI, + ScaleARGBRowDown2Box_MMI, + ScaleARGBRowDown2Box_C, + 2, + 4, + 1) +#endif #undef SDANY // Scale down by even scale factor. @@ -433,6 +486,18 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, 4, 3) #endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI +SDAANY(ScaleARGBRowDownEven_Any_MMI, + ScaleARGBRowDownEven_MMI, + ScaleARGBRowDownEven_C, + 4, + 1) +SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, + ScaleARGBRowDownEvenBox_MMI, + ScaleARGBRowDownEvenBox_C, + 4, + 1) +#endif // Add rows box filter scale down. #define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ @@ -456,6 +521,9 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #ifdef HAS_SCALEADDROW_MSA SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) #endif +#ifdef HAS_SCALEADDROW_MMI +SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) +#endif #undef SAANY #ifdef __cplusplus diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 53a22e8b..beef380a 100644..100755 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -111,6 +111,22 @@ static void ScaleARGBDown2(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MMI + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI + : ScaleARGBRowDown2Box_Any_MMI); + if (IS_ALIGNED(dst_width, 2)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MMI + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI + : ScaleARGBRowDown2Box_MMI); + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -237,6 +253,16 @@ static void ScaleARGBDownEven(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_Any_MMI + : ScaleARGBRowDownEven_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI; + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -418,6 +444,14 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif if (src_width >= 32768) { ScaleARGBFilterCols = filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; @@ -464,6 +498,14 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MMI) + if (!filtering && TestCpuFlag(kCpuHasMMI)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleARGBFilterCols = ScaleARGBCols_MMI; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -471,6 +513,11 @@ static void ScaleARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; + } +#endif } if (y > max_y) { @@ -666,6 +713,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MMI) + if (!filtering && TestCpuFlag(kCpuHasMMI)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleARGBFilterCols = ScaleARGBCols_MMI; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -673,6 +728,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif +#if 
defined(HAS_SCALEARGBCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; + } +#endif } const int max_y = (src_height - 1) << 16; @@ -797,6 +857,14 @@ static void ScaleARGBSimple(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBCols = ScaleARGBCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleARGBCols = ScaleARGBCols_MMI; + } + } +#endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -804,6 +872,11 @@ static void ScaleARGBSimple(int src_width, ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBColsUp2_MMI; + } +#endif } for (j = 0; j < dst_height; ++j) { diff --git a/source/scale_common.cc b/source/scale_common.cc index b28d7da4..106b482a 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1073,6 +1073,14 @@ void ScalePlaneVertical(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(dst_width_bytes, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif for (j = 0; j < dst_height; ++j) { int yi; int yf; diff --git a/source/scale_mmi.cc b/source/scale_mmi.cc new file mode 100644 index 00000000..604397f7 --- /dev/null +++ b/source/scale_mmi.cc @@ -0,0 +1,1128 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +// CPU agnostic row functions +void ScaleRowDown2_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1, dest; + const uint64_t shift = 0x8ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "psrlh %[src0], %[src0], %[shift] \n\t" + + "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "psrlh %[src1], %[src1], %[shift] \n\t" + + "packushb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift] "f"(shift) + : "memory"); +} + +void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest0, dest1; + + const uint64_t mask = 0x00ff00ff00ff00ffULL; + const uint64_t shift = 0x8ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "and %[dest0], %[src0], %[mask] \n\t" + "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "and %[dest1], %[src1], %[mask] \n\t" + "packushb %[dest0], %[dest0], %[dest1] \n\t" + + "psrlh %[src0], %[src0], %[shift] \n\t" + "psrlh %[src1], %[src1], %[shift] \n\t" + "packushb %[dest1], %[src0], %[src1] \n\t" + + "pavgb %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 
0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), + [shift] "f"(shift), [width] "r"(dst_width) + : "memory"); +} + +void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + + uint64_t s0, s1, t0, t1; + uint64_t dest, dest0, dest1; + + const uint64_t ph = 0x0002000200020002ULL; + const uint64_t mask = 0x00ff00ff00ff00ffULL; + const uint64_t shift0 = 0x2ULL; + const uint64_t shift1 = 0x8ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[s0], 0x00(%[s]) \n\t" + "psrlh %[s1], %[s0], %[shift1] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "ldc1 %[t0], 0x00(%[t]) \n\t" + "psrlh %[t1], %[t0], %[shift1] \n\t" + "and %[t0], %[t0], %[mask] \n\t" + + "paddh %[dest0], %[s0], %[s1] \n\t" + "paddh %[dest0], %[dest0], %[t0] \n\t" + "paddh %[dest0], %[dest0], %[t1] \n\t" + "paddh %[dest0], %[dest0], %[ph] \n\t" + "psrlh %[dest0], %[dest0], %[shift0] \n\t" + + "ldc1 %[s0], 0x08(%[s]) \n\t" + "psrlh %[s1], %[s0], %[shift1] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "ldc1 %[t0], 0x08(%[t]) \n\t" + "psrlh %[t1], %[t0], %[shift1] \n\t" + "and %[t0], %[t0], %[mask] \n\t" + + "paddh %[dest1], %[s0], %[s1] \n\t" + "paddh %[dest1], %[dest1], %[t0] \n\t" + "paddh %[dest1], %[dest1], %[t1] \n\t" + "paddh %[dest1], %[dest1], %[ph] \n\t" + "psrlh %[dest1], %[dest1], %[shift0] \n\t" + + "packushb %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[s], %[s], 0x10 \n\t" + "daddiu %[t], %[t], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + 
"bnez %[width], 1b \n\t" + : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) + : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), + [mask] "f"(mask) + : "memory"); +} + +void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + + uint64_t src0, src1, dest; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "punpckhwd %[dest], %[src0], %[src1] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) + : "memory"); +} + +void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + __asm__ volatile( + "1: \n\t" + "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" + "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" + "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" + "lwc1 %[src0], 0x04(%[src_ptr]) \n\t" + "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" + "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" + + "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] 
"=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) + : "memory"); +} + +void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + + uint64_t s0, s_hi, s_lo; + uint64_t t0, t_hi, t_lo; + uint64_t dest, dest_hi, dest_lo; + + const uint64_t mask = 0x0ULL; + const uint64_t ph = 0x0002000200020002ULL; + const uint64_t shfit = 0x2ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[s0], 0x00(%[s]) \n\t" + "punpcklbh %[s_lo], %[s0], %[mask] \n\t" + "punpckhbh %[s_hi], %[s0], %[mask] \n\t" + "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" + + "ldc1 %[t0], 0x00(%[t]) \n\t" + "punpcklbh %[t_lo], %[t0], %[mask] \n\t" + "punpckhbh %[t_hi], %[t0], %[mask] \n\t" + "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" + "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" + + "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" + "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" + + "ldc1 %[s0], 0x08(%[s]) \n\t" + "punpcklbh %[s_lo], %[s0], %[mask] \n\t" + "punpckhbh %[s_hi], %[s0], %[mask] \n\t" + "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" + + "ldc1 %[t0], 0x08(%[t]) \n\t" + "punpcklbh %[t_lo], %[t0], %[mask] \n\t" + "punpckhbh %[t_hi], %[t0], %[mask] \n\t" + "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t" + "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" + + "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" + "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[s], %[s], 0x10 \n\t" + "daddiu %[t], %[t], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), + [t_hi] 
"=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) + : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), + [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) + : "memory"); +} + +void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1, dest; + const uint64_t shift = 0x10ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "psrlw %[src0], %[src0], %[shift] \n\t" + + "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "psrlw %[src1], %[src1], %[shift] \n\t" + + "packsswh %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift] "f"(shift) + : "memory"); +} + +void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" + "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" + + "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t" + "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" + + "pavgh %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] 
"r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) + : "memory"); +} + +void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + + uint64_t s0, s1, s_hi, s_lo; + uint64_t t0, t1, t_hi, t_lo; + uint64_t dest, dest0, dest1; + + const uint64_t ph = 0x0000000200000002ULL; + const uint64_t mask = 0x0000ffff0000ffffULL; + const uint64_t shift0 = 0x10ULL; + const uint64_t shift1 = 0x2ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[s0], 0x00(%[s]) \n\t" + "psrlw %[s1], %[s0], %[shift0] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "ldc1 %[t0], 0x00(%[t]) \n\t" + "psrlw %[t1], %[t0], %[shift0] \n\t" + "and %[t0], %[t0], %[mask] \n\t" + + "paddw %[dest0], %[s0], %[s1] \n\t" + "paddw %[dest0], %[dest0], %[t0] \n\t" + "paddw %[dest0], %[dest0], %[t1] \n\t" + "paddw %[dest0], %[dest0], %[ph] \n\t" + "psrlw %[dest0], %[dest0], %[shift1] \n\t" + + "ldc1 %[s0], 0x08(%[s]) \n\t" + "psrlw %[s1], %[s0], %[shift0] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "ldc1 %[t0], 0x08(%[t]) \n\t" + "psrlw %[t1], %[t0], %[shift0] \n\t" + "and %[t0], %[t0], %[mask] \n\t" + + "paddw %[dest1], %[s0], %[s1] \n\t" + "paddw %[dest1], %[dest1], %[t0] \n\t" + "paddw %[dest1], %[dest1], %[t1] \n\t" + "paddw %[dest1], %[dest1], %[ph] \n\t" + "psrlw %[dest1], %[dest1], %[shift1] \n\t" + + "packsswh %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[s], %[s], 0x10 \n\t" + "daddiu %[t], %[t], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), + [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), + [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), + [dest] "=&f"(dest) + : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] 
"r"(dst_width), + [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), + [mask] "f"(mask) + : "memory"); +} + +void ScaleRowDown4_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + const uint64_t shift = 0x10ULL; + const uint64_t mask = 0x000000ff000000ffULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "psrlw %[src0], %[src0], %[shift] \n\t" + "and %[src0], %[src0], %[mask] \n\t" + "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "psrlw %[src1], %[src1], %[shift] \n\t" + "and %[src1], %[src1], %[mask] \n\t" + "packsswh %[dest_lo], %[src0], %[src1] \n\t" + + "ldc1 %[src0], 0x10(%[src_ptr]) \n\t" + "psrlw %[src0], %[src0], %[shift] \n\t" + "and %[src0], %[src0], %[mask] \n\t" + "ldc1 %[src1], 0x18(%[src_ptr]) \n\t" + "psrlw %[src1], %[src1], %[shift] \n\t" + "and %[src1], %[src1], %[mask] \n\t" + "packsswh %[dest_hi], %[src0], %[src1] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift] "f"(shift), [mask] "f"(mask) + : "memory"); +} + +void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + const uint64_t mask = 0x0ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" + "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" + + "ldc1 
%[src0], 0x10(%[src_ptr]) \n\t" + "ldc1 %[src1], 0x18(%[src_ptr]) \n\t" + "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" + "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [mask] "f"(mask) + : "memory"); +} + +#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + +#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ + "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ + "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ + \ + "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + \ + "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ + "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ + "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ + "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ + "paddh " #reg ", " #reg ", %[ph] \n\t" \ + "psrlh " #reg ", " #reg ", %[shift] \n\t" \ + \ + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ + "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ + "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" + +/* LibYUVScaleTest.ScaleDownBy4_Box */ +void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + 
int dst_width) { + const uint8_t* src0_ptr = src_ptr; + const uint8_t* src1_ptr = src_ptr + src_stride; + const uint8_t* src2_ptr = src_ptr + src_stride * 2; + const uint8_t* src3_ptr = src_ptr + src_stride * 3; + + uint64_t src, src_hi, src_lo; + uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; + + const uint64_t mask0 = 0x0ULL; + const uint64_t mask1 = 0x0001000100010001ULL; + const uint64_t ph = 0x0008000800080008ULL; + const uint64_t shift = 0x4ULL; + + __asm__ volatile( + "1: \n\t" + + DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) + DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) + DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) + DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) + + "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" + "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) + : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), + [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), + [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), + [ph] "f"(ph), [mask1] "f"(mask1) + : "memory"); +} + +#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + +#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ + "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ + "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ + \ + "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ + 
DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + \ + "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ + "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ + "paddw %[dest], %[dest_hi], %[dest] \n\t" \ + "paddw %[dest], %[dest], %[ph] \n\t" \ + "psraw %[dest], %[dest], %[shift] \n\t" \ + "and " #reg ", %[dest], %[mask1] \n\t" \ + \ + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ + "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ + "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" + +/* LibYUVScaleTest.ScaleDownBy4_Box_16 */ +void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src0_ptr = src_ptr; + const uint16_t* src1_ptr = src_ptr + src_stride; + const uint16_t* src2_ptr = src_ptr + src_stride * 2; + const uint16_t* src3_ptr = src_ptr + src_stride * 3; + + uint64_t src, src_hi, src_lo; + uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; + + const uint64_t mask0 = 0x0ULL; + const uint64_t mask1 = 0x00000000ffffffffULL; + const uint64_t ph = 0x0000000800000008ULL; + const uint64_t shift = 0x04ULL; + + __asm__ volatile( + "1: \n\t" + + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) + "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" + "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] 
"=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) + : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), + [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), + [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), + [ph] "f"(ph), [mask1] "f"(mask1) + : "memory"); +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleColsUp2_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + uint64_t src, dest; + + (void)x; + (void)dx; + + __asm__ volatile( + "1: \n\t" + "lwc1 %[src], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[dest], %[src], %[src] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) + : "memory"); +} + +void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + uint64_t src, dest; + + (void)x; + (void)dx; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src], 0x00(%[src_ptr]) \n\t" + + "punpcklhw %[dest], %[src], %[src] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "punpckhhw %[dest], %[src], %[src] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) + : "memory"); +} + +void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + uint64_t src, src_hi, src_lo, dest0, dest1; + const uint64_t 
mask = 0x0ULL; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[mask] \n\t" + + "ldc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "paddush %[dest0], %[dest0], %[src_lo] \n\t" + "ldc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "paddush %[dest1], %[dest1], %[src_hi] \n\t" + + "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [src] "=&f"(src) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), + [mask] "f"(mask) + : "memory"); +} + +void ScaleAddRow_16_MMI(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { + uint64_t src, src_hi, src_lo, dest0, dest1; + const uint64_t mask = 0x0ULL; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklhw %[src_lo], %[src], %[mask] \n\t" + "punpckhhw %[src_hi], %[src], %[mask] \n\t" + + "ldc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "paddw %[dest0], %[dest0], %[src_lo] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + + "ldc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "paddw %[dest1], %[dest1], %[src_hi] \n\t" + "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [src] "=&f"(src) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] 
"r"(src_width), + [mask] "f"(mask) + : "memory"); +} + +void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1, dest; + + __asm__ volatile( + "1: \n\t" + "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" + "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" + "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" + "punpcklwd %[dest], %[src0], %[src1] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), + [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) + : "memory"); +} + +void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + const uint8_t* src0_ptr = src_argb; + const uint8_t* src1_ptr = src_argb + src_stride; + + uint64_t src0, src1, src_hi, src_lo; + uint64_t dest, dest_hi, dest_lo, dest0, dest1; + + const uint64_t mask = 0x0ULL; + const uint64_t ph = 0x0002000200020002ULL; + const uint64_t shift = 0x2ULL; + + __asm__ volatile( + "1: \n\t" + + "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" + "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" + "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" + + "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "punpcklbh %[src_lo], %[src1], %[mask] \n\t" + "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" + "punpcklbh %[src_hi], %[src1], %[mask] \n\t" + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" + "paddh %[dest0], %[dest0], %[ph] \n\t" + "psrlh %[dest0], %[dest0], %[shift] \n\t" + + "dadd %[src0_ptr], %[src0_ptr], 
%[src_stepx_4] \n\t" + "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" + + "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" + "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" + "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" + + "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "punpcklbh %[src_lo], %[src1], %[mask] \n\t" + "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" + "punpcklbh %[src_hi], %[src1], %[mask] \n\t" + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" + "paddh %[dest1], %[dest1], %[ph] \n\t" + "psrlh %[dest1], %[dest1], %[shift] \n\t" + + "packushb %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" + "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), + [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), + [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), + [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), + [ph] "f"(ph) + : "memory"); +} + +// Scales a single row of pixels using point sampling. 
+void ScaleARGBCols_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + + const uint32_t* src_tmp; + + uint64_t dest, offset; + + const uint64_t shift0 = 16; + const uint64_t shift1 = 2; + + __asm__ volatile( + "1: \n\t" + "srav %[offset], %[x], %[shift0] \n\t" + "sllv %[offset], %[offset], %[shift1] \n\t" + "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" + "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" + "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "dadd %[x], %[x], %[dx] \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" + "daddi %[width], %[width], -0x01 \n\t" + "bnez %[width], 1b \n\t" + : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) + : "memory"); +} + +// Scales a single row of pixels up by 2x using point sampling. 
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + uint64_t src, dest0, dest1; + (void)x; + (void)dx; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklwd %[dest0], %[src], %[src] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "punpckhwd %[dest1], %[src], %[src] \n\t" + "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) + : "memory"); +} + +void ScaleARGBFilterCols_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + uint64_t dest, src, src_hi, src_lo; + int xi, xf, nxf; + int64_t fxf, fnxf; + + const uint8_t* src_ptr = src_argb; + + const uint64_t mask0 = 0; + const uint64_t mask1 = 0x7fULL; + + const uint64_t shift2 = 2; + const uint64_t shift9 = 9; + const uint64_t shift7 = 7; + const uint64_t shift16 = 16; + + __asm__ volatile( + "1: \n\t" + "dsrl %[xi], %[x], %[shift16] \n\t" + "dsll %[xi], %[xi], %[shift2] \n\t" + + "dadd %[src_ptr], %[src_argb], %[xi] \n\t" + "ldc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + + "dsrl %[xf], %[x], %[shift9] \n\t" + "andi %[xf], %[xf], 0x7f \n\t" + "xori %[nxf], %[xf], 0x7f \n\t" + "dmtc1 %[xf], %[fxf] \n\t" + "pshufh %[fxf], %[fxf], %[mask0] \n\t" + "dmtc1 %[nxf], %[fnxf] \n\t" + "pshufh %[fnxf], %[fnxf], %[mask0] \n\t" + + "pmullh %[src_lo], %[src_lo], %[fnxf] \n\t" + "pmullh %[src_hi], %[src_hi], %[fxf] \n\t" + "paddh %[dest], %[src_lo], %[src_hi] \n\t" + "psrlh %[dest], %[dest], %[shift7] \n\t" + "packushb %[dest], %[dest], %[mask0] 
\n\t" + + "dadd %[x], %[x], %[dx] \n\t" + + "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" + "daddi %[width], %[width], -0x01 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [fxf] "=&f"(fxf), [fnxf] "=&f"(fnxf), + [xi] "=&r"(xi), [xf] "=&r"(xf), [nxf] "=&r"(nxf) + : [src_argb] "r"(src_argb), [src_ptr] "r"(src_ptr), + [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), [x] "r"(x), + [dx] "r"(dx), [mask0] "f"(mask0), [mask1] "f"(mask1), + [shift2] "r"(shift2), [shift7] "f"(shift7), [shift9] "r"(shift9), + [shift16] "r"(shift16) + : "memory"); +} + +// Divide num by div and return as 16.16 fixed point result. +/* LibYUVBaseTest.TestFixedDiv */ +int FixedDiv_MIPS(int num, int div) { + int quotient = 0; + const int shift = 16; + + asm( + "dsll %[num], %[num], %[shift] \n\t" + "ddiv %[num], %[div] \t\n" + "mflo %[quo] \t\n" + : [quo] "+&r"(quotient) + : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); + + return quotient; +} + +// Divide num by div and return as 16.16 fixed point result. +/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ +int FixedDiv1_MIPS(int num, int div) { + int quotient = 0; + const int shift = 16; + const int val1 = 1; + const int64_t val11 = 0x00010001ULL; + + asm( + "dsll %[num], %[num], %[shift] \n\t" + "dsub %[num], %[num], %[val11] \n\t" + "dsub %[div], %[div], %[val1] \n\t" + "ddiv %[num], %[div] \t\n" + "mflo %[quo] \t\n" + : [quo] "+&r"(quotient) + : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), + [shift] "r"(shift)); + + return quotient; +} + +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. 
+void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2_ptr = src_ptr + src_stride; + + uint64_t src0, src1; + uint64_t dest, dest04, dest15, dest26, dest37; + uint64_t tmp0, tmp1, tmp2, tmp3; + + const uint64_t mask0 = 0x0003000900030009ULL; + const uint64_t mask1 = 0x0001000300010003ULL; + const uint64_t mask2 = 0x0009000300090003ULL; + const uint64_t mask3 = 0x0003000100030001ULL; + const uint64_t ph = 0x0000000800000008ULL; + const uint64_t shift = 4; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[src0], 0x00(%[src1_ptr]) \n\t" + "pmaddhw %[dest04], %[src0], %[mask0] \n\t" + "ldc1 %[src1], 0x00(%[src2_ptr]) \n\t" + "pmaddhw %[dest], %[src1], %[mask1] \n\t" + "paddw %[dest04], %[dest04], %[dest] \n\t" + "paddw %[dest04], %[dest04], %[ph] \n\t" + "psrlw %[dest04], %[dest04], %[shift] \n\t" + + "pmaddhw %[dest15], %[src0], %[mask2] \n\t" + "pmaddhw %[dest], %[src1], %[mask3] \n\t" + "paddw %[dest15], %[dest15], %[dest] \n\t" + "paddw %[dest15], %[dest15], %[ph] \n\t" + "psrlw %[dest15], %[dest15], %[shift] \n\t" + + "ldc1 %[src0], 0x02(%[src1_ptr]) \n\t" + "pmaddhw %[dest26], %[src0], %[mask0] \n\t" + "ldc1 %[src1], 0x02(%[src2_ptr]) \n\t" + "pmaddhw %[dest], %[src1], %[mask1] \n\t" + "paddw %[dest26], %[dest26], %[dest] \n\t" + "paddw %[dest26], %[dest26], %[ph] \n\t" + "psrlw %[dest26], %[dest26], %[shift] \n\t" + + "pmaddhw %[dest37], %[src0], %[mask2] \n\t" + "pmaddhw %[dest], %[src1], %[mask3] \n\t" + "paddw %[dest37], %[dest37], %[dest] \n\t" + "paddw %[dest37], %[dest37], %[ph] \n\t" + "psrlw %[dest37], %[dest37], %[shift] \n\t" + + /* tmp0 = ( 00 04 02 06 ) */ + "packsswh %[tmp0], %[dest04], %[dest26] \n\t" + /* tmp1 = ( 01 05 03 07 ) */ + "packsswh %[tmp1], %[dest15], %[dest37] \n\t" + + /* tmp2 = ( 00 01 04 05 )*/ + "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" + /* tmp3 = ( 02 03 06 07 )*/ + "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" + + /* ( 00 01 02 03 ) */ + "punpcklwd %[dest], 
%[tmp2], %[tmp3] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + /* ( 04 05 06 07 ) */ + "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" + "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), + [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), + [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), + [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) + : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), + [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), + [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) + : "memory"); +} + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index c4648bb9..a7991d2b 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -67,6 +67,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { printf("Has MIPS %d\n", has_mips); int has_msa = TestCpuFlag(kCpuHasMSA); printf("Has MSA %d\n", has_msa); + int has_mmi = TestCpuFlag(kCpuHasMMI); + printf("Has MMI %d\n", has_mmi); #endif } diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 08b6cffa..b8994c30 100644..100755 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -437,6 +437,10 @@ extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +extern "C" void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, @@ -463,6 +467,13 @@ TEST_F(LibYUVScaleTest, 
TestScaleRowUp2_16) { } else { ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); } +#elif !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + int has_mmi = TestCpuFlag(kCpuHasMMI); + if (has_mmi) { + ScaleRowUp2_16_MMI(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); + } else { + ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); + } #else ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); #endif diff --git a/util/cpuid.c b/util/cpuid.c index 59c65d60..84c06022 100644 --- a/util/cpuid.c +++ b/util/cpuid.c @@ -71,6 +71,8 @@ int main(int argc, const char* argv[]) { if (has_mips) { int has_msa = TestCpuFlag(kCpuHasMSA); printf("Has MSA %x\n", has_msa); + int has_mmi = TestCpuFlag(kCpuHasMMI); + printf("Has MMI %x\n", has_mmi); } if (has_x86) { int has_sse2 = TestCpuFlag(kCpuHasSSE2); |