diff options
author | Frank Barchard <fbarchard@google.com> | 2023-06-06 15:05:32 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-06-07 00:02:40 +0000 |
commit | b08ccb6a83f5c76d5a9e181b0f65efd33ce8262d (patch) | |
tree | 955ccd5af95f4edc619ed1a79ed7e9ad5979a21d | |
parent | 1602e4c607f3268685eff6ed56b3a0d994f5b3fc (diff) | |
download | libyuv-b08ccb6a83f5c76d5a9e181b0f65efd33ce8262d.tar.gz |
FP16 to FP32 float conversion row function
Bug: None
Change-Id: I97aab6aafd41c3bf36bfbf33fdcc424e5b3fd6e3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4590225
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
-rw-r--r-- | include/libyuv/row.h | 9 |
-rw-r--r-- | source/row_neon64.cc | 40 |
-rw-r--r-- | unit_test/planar_test.cc | 42 |
3 files changed, 90 insertions, 1 deletion
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5b244d77..7b866d41 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -6180,7 +6180,14 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, float* dst_ptr, float param, int width); - +// Convert FP16 Half Floats to FP32 Floats +void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 + float* dst, + int width); +// Convert FP32 Floats to FP16 Half Floats +void ConvertFP32ToFP16Row_NEON(const float* src, + uint16_t* dst, // fp16 + int width); void ARGBLumaColorTableRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 74190d61..a341dc13 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3960,6 +3960,46 @@ void ByteToFloatRow_NEON(const uint8_t* src, : "cc", "memory", "v1", "v2", "v3"); } +// Convert FP16 Half Floats to FP32 Floats +void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 + float* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats + "subs %w2, %w2, #8 \n" // 8 floats per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtl v2.4s, v1.4h \n" // 8 floats + "fcvtl2 v3.4s, v1.8h \n" + "stp q2, q3, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +// Convert FP32 Floats to FP16 Half Floats +void ConvertFP32ToFP16Row_NEON(const float* src, + uint16_t* dst, // fp16 + int width) { + asm volatile( + "1: \n" + "ldp q2, q3, [%0], #32 \n" // load 8 floats + "subs %w2, %w2, #8 \n" // 8 floats per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtn v1.4h, v2.4s \n" // 8 fp16 halffloats + "fcvtn2 v1.8h, v3.4s \n" + "str q1, [%1], #16 \n" // store 8 fp16 halffloats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + float ScaleMaxSamples_NEON(const float* src, float* dst, float scale, diff 
--git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index ad97b87e..e990f36b 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -4468,4 +4468,46 @@ TEST_F(LibYUVPlanarTest, NV21Copy) { free_aligned_buffer_page_end(dst_vu); } +#if defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && \ + defined(__aarch64__) + +TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) { + int i, j; + const int y_plane_size = benchmark_width_ * benchmark_height_; + + align_buffer_page_end(orig_f, y_plane_size * 4); + align_buffer_page_end(orig_y, y_plane_size * 2); + align_buffer_page_end(dst_opt, y_plane_size * 4); + align_buffer_page_end(rec_opt, y_plane_size * 2); + + for (i = 0; i < y_plane_size; ++i) { + ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f; + } + memset(orig_y, 1, y_plane_size * 2); + memset(dst_opt, 2, y_plane_size * 4); + memset(rec_opt, 3, y_plane_size * 2); + + ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y, + y_plane_size); + + for (j = 0; j < benchmark_iterations_; j++) { + ConvertFP16ToFP32Row_NEON((const uint16_t*)orig_y, (float*)dst_opt, + y_plane_size); + } + + ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt, + y_plane_size); + + for (i = 0; i < y_plane_size; ++i) { + EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]); + } + + free_aligned_buffer_page_end(orig_f); + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(dst_opt); + free_aligned_buffer_page_end(rec_opt); +} + +#endif // defined(ENABLE_ROW_TESTS) && defined(__aarch64__) + } // namespace libyuv |