NV12Scale function using split/merge on UV channel

Bug: libyuv:718, libyuv:838, b/168918847 Change-Id: I78b27baac50f0ce955e00cb6aaf7dfe5a0cb1e3d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2432067 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: richard winterton <rrwinterton@gmail.com>
author: Frank Barchard <fbarchard@google.com> 2020-09-28 12:41:52 -0700
committer: Commit Bot <commit-bot@chromium.org> 2020-09-28 20:13:21 +0000
commit: 7a52fde1c4eb00790bd647b50842797daa5222e6 (patch)
tree: 6f7c07526562863a0ff2d0b2d76421d762f7868b
parent: d6833cda383bace2c98190fe0df504609c9ae074 (diff)
download: libyuv-7a52fde1c4eb00790bd647b50842797daa5222e6.tar.gz
6 files changed, 232 insertions, 5 deletions
diff --git a/README.chromium b/README.chromium
index a2a6f977..253ac353 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1763
+Version: 1764
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h
index 23ba1634..add5a9eb 100644
--- a/include/libyuv/scale.h
+++ b/include/libyuv/scale.h
@@ -145,6 +145,31 @@ int I444Scale_16(const uint16_t* src_y,
                  int dst_height,
                  enum FilterMode filtering);
 
+// Scales an NV12 image (a Y plane plus a UV plane of interleaved U and V
+// samples) from the src width and height to the dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear. NOTE(review): filtering is passed to I420Scale unchanged, so
+// confirm this still holds.
+// Returns 0 if successful.
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_uv,
+              int dst_stride_uv,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+
 #ifdef __cplusplus
 // Legacy API.  Deprecated.
 LIBYUV_API
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 1172f9a3..7c2d059e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1763
+#define LIBYUV_VERSION 1764
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 709f0709..c0541888 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -7064,7 +7064,6 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
       "psrlw      $0xf,%%xmm4                    \n"
       "packuswb   %%xmm4,%%xmm4                  \n"
       "pxor       %%xmm5,%%xmm5                  \n"
-      "1:                                        \n"
 
       LABELALIGN
       "1:                                        \n"
@@ -7111,7 +7110,6 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
       "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
       "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
       "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
-      "1:                                        \n"
 
       LABELALIGN
       "1:                                        \n"
@@ -7148,6 +7146,29 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
+// Writes max(src_x[i], 0.0f) to dst_y[i]; always handles at least one float.
+void ClampFloatToZero_SSE2(const float* src_x,
+                           float* dst_y,
+                           int width) {
+  asm volatile(
+      "pxor      %%xmm1,%%xmm1                   \n"  // xmm1 = 0.0f
+
+      LABELALIGN
+      "1:                                        \n"
+      "movd       (%0),%%xmm0                    \n"  // load float
+      "maxss      %%xmm1, %%xmm0                 \n"  // clamp to zero
+      "add        $0x4,%0                        \n"  // next source float
+      "movd       %%xmm0, (%1)                   \n"  // store float
+      "add        $0x4,%1                        \n"  // next destination float
+      "sub        $0x1,%2                        \n"  // 1 float per loop
+      "jg         1b                             \n"
+      : "+r"(src_x),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/scale.cc b/source/scale.cc
index b17920a6..d26bfec7 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1670,7 +1670,7 @@ void ScalePlane_16(const uint16_t* src,
   }
   if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
-    // Arbitrary scale vertically, but unscaled vertically.
+    // Arbitrary scale vertically, but unscaled horizontally.
     ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
                           dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
@@ -1869,6 +1869,69 @@ int I444Scale_16(const uint16_t* src_y,
   return 0;
 }
 
+// Scale an NV12 image.
+// Splits the interleaved UV plane into temporary U and V planes, scales all
+// three planes with I420Scale, then merges U and V into the destination UV.
+// Returns 0 on success, -1 for invalid arguments, else I420Scale's error.
+// Adapted from webrtc/common_video/libyuv/include/webrtc_libyuv.h, see
+// https://bugs.chromium.org/p/libyuv/issues/detail?id=838
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_uv,
+              int dst_stride_uv,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering) {
+  const int src_chroma_width = (src_width + 1) / 2;
+  const int src_chroma_height = (src_height + 1) / 2;
+  const int dst_chroma_width = (dst_width + 1) / 2;
+  const int dst_chroma_height = (dst_height + 1) / 2;
+  if (!src_y || !src_uv || !dst_y || !dst_uv || src_width <= 0 ||
+      src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  if (src_width == dst_width && src_height == dst_height) {
+    // No scaling: each UV row holds src_chroma_width interleaved U,V pairs.
+    libyuv::CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, src_width,
+                      src_height);
+    libyuv::CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv,
+                      src_chroma_width * 2, src_chroma_height);
+    return 0;
+  }
+
+  // Allocate temporary memory for splitting UV planes and scaling them.
+  align_buffer_64(tmp_buffer, src_chroma_width * src_chroma_height * 2 +
+                                  dst_chroma_width * dst_chroma_height * 2);
+  uint8_t* const src_u = tmp_buffer;
+  uint8_t* const src_v = src_u + src_chroma_width * src_chroma_height;
+  uint8_t* const dst_u = src_v + src_chroma_width * src_chroma_height;
+  uint8_t* const dst_v = dst_u + dst_chroma_width * dst_chroma_height;
+
+  // Split source UV plane into separate U and V plane using the temporary data.
+  libyuv::SplitUVPlane(src_uv, src_stride_uv, src_u, src_chroma_width, src_v,
+                       src_chroma_width, src_chroma_width, src_chroma_height);
+  // Scale the planes; keep the status so a failure reaches the caller.
+  const int ret = libyuv::I420Scale(
+      src_y, src_stride_y, src_u, src_chroma_width, src_v, src_chroma_width,
+      src_width, src_height, dst_y, dst_stride_y, dst_u, dst_chroma_width,
+      dst_v, dst_chroma_width, dst_width, dst_height, filtering);
+  if (ret == 0) {
+    // Merge the UV planes into the destination.
+    libyuv::MergeUVPlane(dst_u, dst_chroma_width, dst_v, dst_chroma_width,
+                         dst_uv, dst_stride_uv, dst_chroma_width,
+                         dst_chroma_height);
+  }
+  free_aligned_buffer_64(tmp_buffer);
+  return ret;
+}
+
 // Deprecated api
 LIBYUV_API
 int Scale(const uint8_t* src_y,
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 1508bf25..86ac8164 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -494,6 +494,110 @@ static int I444TestFilter_16(int src_width,
   return max_diff;
 }
 
+// Test NV12 scaling with C vs Opt and return maximum byte difference. 0 = exact.
+static int NV12TestFilter(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          FilterMode f,
+                          int benchmark_iterations,
+                          int disable_cpu_flags,
+                          int benchmark_cpu_info) {
+  if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+    return 0;  // Nothing is scaled for a rejected size; report no difference.
+  }
+
+  int i, j;
+  int src_width_uv = (Abs(src_width) + 1) >> 1;
+  int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+  int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+  int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
+
+  int src_stride_y = Abs(src_width);
+  int src_stride_uv = src_width_uv * 2;
+
+  align_buffer_page_end(src_y, src_y_plane_size);
+  align_buffer_page_end(src_uv, src_uv_plane_size);
+  if (!src_y || !src_uv) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  MemRandomize(src_y, src_y_plane_size);
+  MemRandomize(src_uv, src_uv_plane_size);
+
+  int dst_width_uv = (dst_width + 1) >> 1;
+  int dst_height_uv = (dst_height + 1) >> 1;
+
+  int64_t dst_y_plane_size = (dst_width) * (dst_height);
+  int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
+
+  int dst_stride_y = dst_width;
+  int dst_stride_uv = dst_width_uv * 2;
+
+  align_buffer_page_end(dst_y_c, dst_y_plane_size);
+  align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+  align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+  align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+  if (!dst_y_c || !dst_uv_c || !dst_y_opt || !dst_uv_opt) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  double c_time = get_time();
+  NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv,
+            src_width, src_height, dst_y_c, dst_stride_y, dst_uv_c,
+            dst_stride_uv, dst_width, dst_height, f);
+  c_time = (get_time() - c_time);
+
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  double opt_time = get_time();
+  for (i = 0; i < benchmark_iterations; ++i) {
+    NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv,
+              src_width, src_height, dst_y_opt, dst_stride_y, dst_uv_opt,
+              dst_stride_uv, dst_width, dst_height, f);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations;
+  // Report performance of C (single run) vs OPT (average per iteration).
+  printf("filter %d - %8d us C - %8d us OPT\n", f,
+         static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+  // C version may be a little off from the optimized. Order of
+  //  operations may introduce rounding somewhere. So do a difference
+  //  of the buffers and return the largest absolute difference found;
+  //  the caller compares it against its own tolerance.
+  int max_diff = 0;
+  for (i = 0; i < (dst_height); ++i) {
+    for (j = 0; j < (dst_width); ++j) {
+      int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+                         dst_y_opt[(i * dst_stride_y) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+  // Compare the UV plane: each row holds dst_width_uv * 2 bytes.
+  for (i = 0; i < (dst_height_uv); ++i) {
+    for (j = 0; j < (dst_width_uv * 2); ++j) {
+      int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] -
+                         dst_uv_opt[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  free_aligned_buffer_page_end(dst_y_c);
+  free_aligned_buffer_page_end(dst_uv_c);
+  free_aligned_buffer_page_end(dst_y_opt);
+  free_aligned_buffer_page_end(dst_uv_opt);
+  free_aligned_buffer_page_end(src_y);
+  free_aligned_buffer_page_end(src_uv);
+
+  return max_diff;
+}
+
 // The following adjustments in dimensions ensure the scale factor will be
 // exactly achieved.
 // 2 is chroma subsample.
@@ -532,6 +636,14 @@ static int I444TestFilter_16(int src_width,
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
         benchmark_cpu_info_);                                                 \
     EXPECT_LE(diff, max_diff);                                                \
+  }                                                                           \
+  TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) {                 \
+    int diff = NV12TestFilter(                                                \
+        SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom),  \
+        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
+        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
+        benchmark_cpu_info_);                                                 \
+    EXPECT_LE(diff, max_diff);                                                \
   }
 
 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
@@ -617,6 +729,12 @@ TEST_FACTOR(3, 1, 3, 0)
                                  benchmark_iterations_, disable_cpu_flags_,   \
                                  benchmark_cpu_info_);                        \
     EXPECT_LE(diff, max_diff);                                                \
+  }                                                                           \
+  TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) {      \
+    int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width,     \
+                              height, kFilter##filter, benchmark_iterations_, \
+                              disable_cpu_flags_, benchmark_cpu_info_);       \
+    EXPECT_LE(diff, max_diff);                                                \
   }
 
 #ifdef ENABLE_SLOW_TESTS
author	Frank Barchard <fbarchard@google.com>	2020-09-28 12:41:52 -0700
committer	Commit Bot <commit-bot@chromium.org>	2020-09-28 20:13:21 +0000
commit	7a52fde1c4eb00790bd647b50842797daa5222e6 (patch)
tree	6f7c07526562863a0ff2d0b2d76421d762f7868b
parent	d6833cda383bace2c98190fe0df504609c9ae074 (diff)
download	libyuv-7a52fde1c4eb00790bd647b50842797daa5222e6.tar.gz