about | summary | refs | log | tree | commit | diff
path: root/source/row_neon64.cc
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2017-08-09 14:25:38 -0700
committer: Frank Barchard <fbarchard@google.com> 2017-08-09 22:19:45 +0000
commit: 83ca1abe09207daae1628fd8f0d4a0debaef96c6 (patch)
tree: 7500904fa123f34b0e980521a60d6b6d906857f2 /source/row_neon64.cc
parent: 8676ad7004fbe86a855923938e4db6a83fe40b91 (diff)
download: libyuv-83ca1abe09207daae1628fd8f0d4a0debaef96c6.tar.gz
Change ScaleSumSamples to return Sum of Squares
TBR=kjellander@chromium.org
BUG=libyuv:717
TEST=LibYUVPlanarTest.TestScaleSumSamples_Opt

Change-Id: I5208666f3968c5c4b0f1b0c951f24216d78ee3fe
Reviewed-on: https://chromium-review.googlesource.com/607184
Reviewed-by: Cheng Wang <wangcheng@google.com>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- source/row_neon64.cc 29
1 files changed, 16 insertions, 13 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 456c6ea5..622ff5fb 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2616,30 +2616,33 @@ float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width) {
- float fmax;
+ float fsum;
asm volatile(
- "movi v3.4s, #0 \n" // max
- "movi v4.4s, #0 \n" // max
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %4.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "fmax v3.4s, v3.4s, v1.4s \n" // max
- "fmax v4.4s, v4.4s, v2.4s \n"
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+
"b.gt 1b \n"
- "fmax v3.4s, v3.4s, v4.4s \n" // max
- "fmaxv %s3, v3.4s \n" // signed max acculator
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "fmov %w3, s5 \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
- "=w"(fmax) // %3
+ "=w"(fsum) // %3
: "w"(scale) // %4
- : "cc", "memory", "v1", "v2", "v3", "v4");
- return fmax;
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fsum;
}
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {