FP16 to FP32 float conversion row function

Bug: None Change-Id: I97aab6aafd41c3bf36bfbf33fdcc424e5b3fd6e3 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4590225 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Wan-Teh Chang <wtc@google.com>
author: Frank Barchard <fbarchard@google.com> 2023-06-06 15:05:32 -0700
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2023-06-07 00:02:40 +0000
commit: b08ccb6a83f5c76d5a9e181b0f65efd33ce8262d (patch)
tree: 955ccd5af95f4edc619ed1a79ed7e9ad5979a21d
parent: 1602e4c607f3268685eff6ed56b3a0d994f5b3fc (diff)
download: libyuv-b08ccb6a83f5c76d5a9e181b0f65efd33ce8262d.tar.gz
3 files changed, 90 insertions, 1 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5b244d77..7b866d41 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -6180,7 +6180,14 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
                              float* dst_ptr,
                              float param,
                              int width);
-
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
+                               float* dst,
+                               int width);
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+                               uint16_t* dst,  // fp16
+                               int width);
 void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              int width,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 74190d61..a341dc13 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3960,6 +3960,46 @@ void ByteToFloatRow_NEON(const uint8_t* src,
       : "cc", "memory", "v1", "v2", "v3");
 }
 
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
+                               float* dst,
+                               int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v1.8h}, [%0], #16            \n"  // load 8 halffloats
+      "subs        %w2, %w2, #8                  \n"  // 8 floats per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fcvtl       v2.4s, v1.4h                  \n"  // 8 floats
+      "fcvtl2      v3.4s, v1.8h                  \n"
+      "stp         q2, q3, [%1], #32             \n"  // store 8 floats
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+                               uint16_t* dst,  // fp16
+                               int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ldp         q2, q3, [%0], #32             \n"  // load 8 floats
+      "subs        %w2, %w2, #8                  \n"  // 8 floats per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fcvtn       v1.4h, v2.4s                  \n"  // 8 fp16 halffloats
+      "fcvtn2      v1.8h, v3.4s                  \n"
+      "str         q1, [%1], #16                 \n"  // store 8 fp16 halffloats
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
 float ScaleMaxSamples_NEON(const float* src,
                            float* dst,
                            float scale,
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index ad97b87e..e990f36b 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -4468,4 +4468,46 @@ TEST_F(LibYUVPlanarTest, NV21Copy) {
   free_aligned_buffer_page_end(dst_vu);
 }
 
+#if defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && \
+    defined(__aarch64__)
+
+TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
+  int i, j;
+  const int y_plane_size = benchmark_width_ * benchmark_height_;
+
+  align_buffer_page_end(orig_f, y_plane_size * 4);
+  align_buffer_page_end(orig_y, y_plane_size * 2);
+  align_buffer_page_end(dst_opt, y_plane_size * 4);
+  align_buffer_page_end(rec_opt, y_plane_size * 2);
+
+  for (i = 0; i < y_plane_size; ++i) {
+    ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
+  }
+  memset(orig_y, 1, y_plane_size * 2);
+  memset(dst_opt, 2, y_plane_size * 4);
+  memset(rec_opt, 3, y_plane_size * 2);
+
+  ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
+                            y_plane_size);
+
+  for (j = 0; j < benchmark_iterations_; j++) {
+    ConvertFP16ToFP32Row_NEON((const uint16_t*)orig_y, (float*)dst_opt,
+                              y_plane_size);
+  }
+
+  ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
+                            y_plane_size);
+
+  for (i = 0; i < y_plane_size; ++i) {
+    EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_f);
+  free_aligned_buffer_page_end(orig_y);
+  free_aligned_buffer_page_end(dst_opt);
+  free_aligned_buffer_page_end(rec_opt);
+}
+
+#endif  // defined(ENABLE_ROW_TESTS) && defined(__aarch64__)
+
 }  // namespace libyuv
author	Frank Barchard <fbarchard@google.com>	2023-06-06 15:05:32 -0700
committer	libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>	2023-06-07 00:02:40 +0000
commit	b08ccb6a83f5c76d5a9e181b0f65efd33ce8262d (patch)
tree	955ccd5af95f4edc619ed1a79ed7e9ad5979a21d
parent	1602e4c607f3268685eff6ed56b3a0d994f5b3fc (diff)
download	libyuv-b08ccb6a83f5c76d5a9e181b0f65efd33ce8262d.tar.gz