about summary refs log tree commit diff
path: root/files/source/row_neon64.cc
diff options
context:
space:
mode:
Diffstat (limited to 'files/source/row_neon64.cc')
-rw-r--r--  files/source/row_neon64.cc | 200
1 file changed, 176 insertions(+), 24 deletions(-)
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
index 24b4520b..f5cbb470 100644
--- a/files/source/row_neon64.cc
+++ b/files/source/row_neon64.cc
@@ -608,7 +608,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
@@ -629,7 +629,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
@@ -653,7 +653,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@@ -677,7 +677,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
// Copy multiple of 32.
@@ -693,7 +693,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
@@ -800,7 +800,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
@@ -818,7 +818,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
+ );
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
@@ -835,7 +835,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
#define RGB565TOARGB \
@@ -867,7 +867,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
- );
+ );
}
#define ARGB1555TOARGB \
@@ -924,7 +924,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
#define ARGB4444TOARGB \
@@ -955,7 +955,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@@ -973,7 +973,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
+ );
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
@@ -990,7 +990,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
+ );
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
@@ -1005,7 +1005,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
@@ -1020,7 +1020,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
- );
+ );
}
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@@ -1040,7 +1040,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@@ -1060,7 +1060,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@@ -1087,7 +1087,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
- );
+ );
}
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@@ -1114,7 +1114,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
- );
+ );
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@@ -1135,7 +1135,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
+ );
}
void I422ToYUY2Row_NEON(const uint8_t* src_y,
@@ -1298,7 +1298,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -1863,7 +1863,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28"
- );
+ );
}
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -2611,7 +2611,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
: "r"(2LL), // %5
"r"(6LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
// SobelY as a matrix is
@@ -2648,7 +2648,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
: "r"(1LL), // %4
"r"(6LL) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ );
}
// Caveat - rounds float to half float whereas scaling version truncates.
@@ -2876,6 +2876,158 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
// Convert biplanar NV21 to packed YUV24.
// NV21 is a full-size Y plane plus a half-width interleaved VU plane; each
// VU pair is duplicated (zip1 with itself) to cover two Y samples, then the
// three channels are interleaved with st3 into 3-byte pixels.
// NOTE(review): st3 emits v0 (V), v1 (U), v2 (Y) per pixel — confirm this
// byte order against the YUV24 consumers before relying on it.
// Processes 16 pixels per iteration; width is assumed to be a multiple of 16
// (the loop would overrun the row otherwise) — caller handles any remainder.
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      "1: \n"
      "ld1 {v2.16b}, [%0], #16 \n"  // load 16 Y values
      "ld2 {v0.8b, v1.8b}, [%1], #16 \n"  // load 8 VU pairs (16 bytes)
      "zip1 v0.16b, v0.16b, v0.16b \n"  // replicate V values: V0V0V1V1...
      "zip1 v1.16b, v1.16b, v1.16b \n"  // replicate U values: U0U0U1U1...
      "subs %w3, %w3, #16 \n"  // 16 pixels per loop
      "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 YUV pixels
      "b.gt 1b \n"
      : "+r"(src_y),  // %0
        "+r"(src_vu),  // %1
        "+r"(dst_yuv24),  // %2
        "+r"(width)  // %3
      :
      : "cc", "memory", "v0", "v1", "v2");
}
+
// Subsample the U/V channels of two rows of AYUV into one half-width UV row.
// Each output UV pair is the rounded average of a 2x2 block (uaddlp pairs
// horizontally, uadalp accumulates the second row, uqrshrn #2 divides by 4
// with rounding).  Per the register comments below, byte 0 of each AYUV
// pixel is V and byte 1 is U — presumably the V,U,Y,A layout; verify against
// the AYUV definition in the headers.  Output is U then V (st2 stores v2,v3).
// Processes 16 source pixels (8 UV outputs) per iteration; width is assumed
// to be a multiple of 16.
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_uv,
                      int width) {
  // Second source row, one stride below the first, for the vertical average.
  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
  asm volatile(

      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels
      "uaddlp v0.8h, v0.16b \n"  // V 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // U 16 bytes -> 8 shorts.
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next row's 16
      "uadalp v0.8h, v4.16b \n"  // V 16 bytes -> 8 shorts.
      "uadalp v1.8h, v5.16b \n"  // U 16 bytes -> 8 shorts.
      "uqrshrn v3.8b, v0.8h, #2 \n"  // 2x2 average, rounded
      "uqrshrn v2.8b, v1.8h, #2 \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop.
      "st2 {v2.8b,v3.8b}, [%2], #16 \n"  // store 8 pixels UV.
      "b.gt 1b \n"
      : "+r"(src_ayuv),  // %0
        "+r"(src_ayuv_1),  // %1
        "+r"(dst_uv),  // %2
        "+r"(width)  // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
+
// Subsample the U/V channels of two rows of AYUV into one half-width VU row.
// Identical to AYUVToUVRow_NEON except the output byte order: here V is
// narrowed into v0 and U into v1, so st2 emits V then U (NV21-style VU).
// Processes 16 source pixels (8 VU outputs) per iteration; width is assumed
// to be a multiple of 16.
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_vu,
                      int width) {
  // Second source row, one stride below the first, for the vertical average.
  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
  asm volatile(

      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels
      "uaddlp v0.8h, v0.16b \n"  // V 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // U 16 bytes -> 8 shorts.
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next row's 16
      "uadalp v0.8h, v4.16b \n"  // V 16 bytes -> 8 shorts.
      "uadalp v1.8h, v5.16b \n"  // U 16 bytes -> 8 shorts.
      "uqrshrn v0.8b, v0.8h, #2 \n"  // 2x2 average, rounded
      "uqrshrn v1.8b, v1.8h, #2 \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop.
      "st2 {v0.8b,v1.8b}, [%2], #16 \n"  // store 8 pixels VU.
      "b.gt 1b \n"
      : "+r"(src_ayuv),  // %0
        "+r"(src_ayuv_1),  // %1
        "+r"(dst_vu),  // %2
        "+r"(width)  // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
+
// Copy row of AYUV Y's into Y.
// ld4 deinterleaves the 4-byte pixels; the third channel (v2) is stored as Y,
// consistent with the V,U,Y,A byte order assumed by the other AYUV rows here.
// Processes 16 pixels per iteration; width is assumed to be a multiple of 16.
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop
      "st1 {v2.16b}, [%1], #16 \n"  // store 16 Y pixels
      "b.gt 1b \n"
      : "+r"(src_ayuv),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
+
// Divide src_values by src_weights element-wise, convert the quotient to
// bytes saturated to [0, 255] (fcvtas rounds to nearest, uqxtn saturates),
// and write a per-pixel mask: 0xFF where the quotient is > 0.0, else 0x00.
// Processes 8 pixels per iteration; width is assumed to be a multiple of 8.
//
// Fixes vs. the previous revision:
//  - The mask compare now runs BEFORE fcvtas overwrites v1/v2 with integers
//    (fcmgt on post-fcvtas registers compared integer bit patterns as
//    floats, and the old comment claimed it compared the weights).
//  - The final mask narrow uses v5.8h; it previously read v1.8h, so
//    dst_mask received a copy of the pixel bytes instead of the mask.
//  - v0 is written by movi and is now listed in the clobbers.
void FloatDivToByteRow_NEON(const float* src_weights,
                            const float* src_values,
                            uint8_t* dst_out,
                            uint8_t* dst_mask,
                            int width) {
  asm volatile(
      "movi v0.4s, #0 \n"  // 0.0f reference for the mask compare

      "1: \n"
      "ld1 {v1.4s,v2.4s}, [%0], #32 \n"  // load 8 float weights
      "ld1 {v3.4s,v4.4s}, [%1], #32 \n"  // load 8 float values
      "subs %w4, %w4, #8 \n"  // 8 pixels per loop

      "fdiv v1.4s, v3.4s, v1.4s \n"  // values / weights
      "fdiv v2.4s, v4.4s, v2.4s \n"

      "fcmgt v5.4s, v1.4s, v0.4s \n"  // mask lanes: quotient > 0.0
      "fcmgt v6.4s, v2.4s, v0.4s \n"

      "fcvtas v1.4s, v1.4s \n"  // float to int, round to nearest
      "fcvtas v2.4s, v2.4s \n"  // float to int
      "uqxtn v1.4h, v1.4s \n"  // 8 shorts
      "uqxtn2 v1.8h, v2.4s \n"
      "uqxtn v1.8b, v1.8h \n"  // 8 bytes, saturated to 255

      "st1 {v1.8b}, [%2], #8 \n"  // store 8 byte out

      "uqxtn v5.4h, v5.4s \n"  // narrow mask 0xFFFFFFFF/0 -> 0xFFFF/0
      "uqxtn2 v5.8h, v6.4s \n"
      "uqxtn v5.8b, v5.8h \n"  // -> 0xFF/0x00 per pixel

      "st1 {v5.8b}, [%3], #8 \n"  // store 8 byte mask

      "b.gt 1b \n"
      : "+r"(src_weights),  // %0
        "+r"(src_values),  // %1
        "+r"(dst_out),  // %2
        "+r"(dst_mask),  // %3
        "+r"(width)  // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
+
// Convert biplanar UV channel of NV12 to NV21.
// ld2 deinterleaves U into v0 and V into v1; orr x, y, y is the canonical
// register move, copying U to v2 so st2 can re-interleave as V,U pairs.
// Processes 16 UV pairs (32 bytes) per iteration; width is assumed to be a
// multiple of 16.
void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b, v1.16b}, [%0], #32 \n"  // load 16 UV values
      "orr v2.16b, v0.16b, v0.16b \n"  // move U after V (mov v2, v0)
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop
      "st2 {v1.16b, v2.16b}, [%1], #32 \n"  // store 16 VU pixels
      "b.gt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_vu),  // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "v0", "v1", "v2");
}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus