aboutsummaryrefslogtreecommitdiff
path: root/source/compare_neon.cc
diff options
context:
space:
mode:
authorSadaf Ebrahimi <sadafebrahimi@google.com>2023-08-25 16:27:50 +0000
committerSadaf Ebrahimi <sadafebrahimi@google.com>2023-08-25 16:27:50 +0000
commit678702573531f19ae36847a6a07257aaae623fbe (patch)
tree5d9b28c2e73f9d5e76d9556181a6cf73ac4bd182 /source/compare_neon.cc
parent8ef2efc6534659701bc7114e57133207547f60e7 (diff)
downloadlibyuv-678702573531f19ae36847a6a07257aaae623fbe.tar.gz
Move libyuv/files/ directly under libyuv
Test: TreeHugger Merged-In: I773d1ae01539cc5d200768b526f10b2922567f72 Change-Id: I4ba1f1e781d7fd3ad96639dfdc08f654e45ae3d3
Diffstat (limited to 'source/compare_neon.cc')
-rw-r--r--source/compare_neon.cc96
1 files changed, 96 insertions, 0 deletions
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
new file mode 100644
index 00000000..afdd6012
--- /dev/null
+++ b/source/compare_neon.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff;
+
+ asm volatile(
+ "vmov.u16 q4, #0 \n" // accumulator
+
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
+
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
+
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+ :
+ : "cc", "q0", "q1", "q2", "q3", "q4");
+ return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif