diff options
author | Frank Barchard <fbarchard@google.com> | 2017-08-09 14:25:38 -0700 |
---|---|---|
committer | Frank Barchard <fbarchard@google.com> | 2017-08-09 22:19:45 +0000 |
commit | 83ca1abe09207daae1628fd8f0d4a0debaef96c6 (patch) | |
tree | 7500904fa123f34b0e980521a60d6b6d906857f2 /source/row_neon64.cc | |
parent | 8676ad7004fbe86a855923938e4db6a83fe40b91 (diff) | |
download | libyuv-83ca1abe09207daae1628fd8f0d4a0debaef96c6.tar.gz |
Change ScaleSumSamples to return Sum of Squares
TBR=kjellander@chromium.org
BUG=libyuv:717
TEST=LibYUVPlanarTest.TestScaleSumSamples_Opt
Change-Id: I5208666f3968c5c4b0f1b0c951f24216d78ee3fe
Reviewed-on: https://chromium-review.googlesource.com/607184
Reviewed-by: Cheng Wang <wangcheng@google.com>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- | source/row_neon64.cc | 29 |
1 files changed, 16 insertions, 13 deletions
```diff
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 456c6ea5..622ff5fb 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2616,30 +2616,33 @@ float ScaleSumSamples_NEON(const float* src,
                            float* dst,
                            float scale,
                            int width) {
-  float fmax;
+  float fsum;
   asm volatile(
-      "movi v3.4s, #0 \n" // max
-      "movi v4.4s, #0 \n" // max
+      "movi v5.4s, #0 \n" // max
+      "movi v6.4s, #0 \n" // max
 
       "1: \n"
       "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
       "subs %w2, %w2, #8 \n" // 8 processed per loop
-      "fmul v1.4s, v1.4s, %4.s[0] \n" // scale
-      "fmul v2.4s, v2.4s, %4.s[0] \n" // scale
-      "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
-      "fmax v3.4s, v3.4s, v1.4s \n" // max
-      "fmax v4.4s, v4.4s, v2.4s \n"
+      "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+      "fmul v4.4s, v2.4s, %4.s[0] \n"
+      "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+      "fmla v6.4s, v2.4s, v2.4s \n"
+      "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+
       "b.gt 1b \n"
-      "fmax v3.4s, v3.4s, v4.4s \n" // max
-      "fmaxv %s3, v3.4s \n" // signed max acculator
+      "faddp v5.4s, v5.4s, v6.4s \n"
+      "faddp v5.4s, v5.4s, v5.4s \n"
+      "faddp v5.4s, v5.4s, v5.4s \n"
+      "fmov %w3, s5 \n" // sum
       : "+r"(src), // %0
         "+r"(dst), // %1
         "+r"(width), // %2
-        "=w"(fmax) // %3
+        "=w"(fsum) // %3
       : "w"(scale) // %4
-      : "cc", "memory", "v1", "v2", "v3", "v4");
-  return fmax;
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fsum;
 }
 
 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
```