From 678702573531f19ae36847a6a07257aaae623fbe Mon Sep 17 00:00:00 2001
From: Sadaf Ebrahimi <sadafebrahimi@google.com>
Date: Fri, 25 Aug 2023 16:27:50 +0000
Subject: Move libyuv/files/ directly under libyuv

Test: TreeHugger
Merged-In: I773d1ae01539cc5d200768b526f10b2922567f72
Change-Id: I4ba1f1e781d7fd3ad96639dfdc08f654e45ae3d3
---
 util/Makefile         |   9 +
 util/color.cc         | 120 ++++++++++
 util/compare.cc       |  67 ++++++
 util/cpuid.c          | 128 ++++++++++
 util/i444tonv12_eg.cc |  28 +++
 util/psnr.cc          | 291 +++++++++++++++++++++++
 util/psnr.h           |  47 ++++
 util/psnr_main.cc     | 633 ++++++++++++++++++++++++++++++++++++++++++++++++++
 util/ssim.cc          | 364 +++++++++++++++++++++++++++++
 util/ssim.h           |  38 +++
 util/yuvconstants.c   | 106 +++++++++
 util/yuvconvert.cc    | 367 +++++++++++++++++++++++++++++
 12 files changed, 2198 insertions(+)
 create mode 100644 util/Makefile
 create mode 100644 util/color.cc
 create mode 100644 util/compare.cc
 create mode 100644 util/cpuid.c
 create mode 100644 util/i444tonv12_eg.cc
 create mode 100644 util/psnr.cc
 create mode 100644 util/psnr.h
 create mode 100644 util/psnr_main.cc
 create mode 100644 util/ssim.cc
 create mode 100644 util/ssim.h
 create mode 100644 util/yuvconstants.c
 create mode 100644 util/yuvconvert.cc

(limited to 'util')

diff --git a/util/Makefile b/util/Makefile
new file mode 100644
index 00000000..40e74b65
--- /dev/null
+++ b/util/Makefile
@@ -0,0 +1,9 @@
+psnr: psnr.cc ssim.cc psnr_main.cc
+ifeq ($(CXX),icl)
+	$(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+else
+	$(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
+endif
+
+# for MacOS
+# /usr/local/bin/g++-7 -msse2 -O3 -fopenmp -Bstatic -o psnr psnr.cc ssim.cc psnr_main.cc
diff --git a/util/color.cc b/util/color.cc
new file mode 100644
index 00000000..8c3bbefd
--- /dev/null
+++ b/util/color.cc
@@ -0,0 +1,120 @@
+/*
+ *  Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8 bit YUV channels.
+
+// For those MCs that can be represented as kr and kb:
+// Full range
+// float M[3][3]
+// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
+// float B[3]
+// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
+// Limited range
+// float M[3][3]
+// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
+// float B[3]
+// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
+
+// mc bt
+// 1 bt.709      KR = 0.2126; KB = 0.0722
+// 4 fcc         KR = 0.30;   KB = 0.11
+// 6 bt.601      KR = 0.299;  KB = 0.114
+// 7 SMPTE 240M  KR = 0.212;  KB = 0.087
+// 10 bt2020     KR = 0.2627; KB = 0.0593
+
+// BT.709 full range YUV to RGB reference
+//  R = Y               + V * 1.5748
+//  G = Y - U * 0.18732 - V * 0.46812
+//  B = Y + U * 1.8556
+//  KR = 0.2126
+//  KB = 0.0722
+
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// // Y contribution to R,G,B.  Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32    /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22  /* round(0.34414 * 64) */
+// #define VG 46  /* round(0.71414 * 64) */
+// #define VR 90  /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int round(float v) {
+  return (int)(v + 0.5);
+}
+
+int main(int argc, const char* argv[]) {
+  if (argc < 2) {
+    printf("color kr kb\n");
+    return -1;
+  }
+  float kr = atof(argv[1]);
+  float kb = atof(argv[2]);
+  float kg = 1 - kr - kb;
+
+  float vr = 2 * (1 - kr);
+  float ug = 2 * ((1 - kb) * kb / kg);
+  float vg = 2 * ((1 - kr) * kr / kg);
+  float ub = 2 * (1 - kb);
+
+  printf("Full range\n");
+  printf("R = Y                + V * %5f\n", vr);
+  printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+  printf("B = Y + U * %5f\n", ub);
+
+  printf("KR = %4f; ", kr);
+  printf("KB = %4f\n", kb);
+  //  printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32    /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
+
+  printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+  printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+  printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+  printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+  vr = 255.f / 224.f * 2 * (1 - kr);
+  ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+  vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+  ub = 255.f / 224.f * 2 * (1 - kb);
+
+  printf("Limited range\n");
+  printf("R = (Y - 16) * 1.164                + V * %5f\n", vr);
+  printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+  printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+  //  printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32    /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
+
+  printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+  printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+  printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+  printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+  return 0;
+}
diff --git a/util/compare.cc b/util/compare.cc
new file mode 100644
index 00000000..a16613ee
--- /dev/null
+++ b/util/compare.cc
@@ -0,0 +1,67 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/version.h"
+
+int main(int argc, char** argv) {
+  if (argc < 1) {
+    printf("libyuv compare v%d\n", LIBYUV_VERSION);
+    printf("compare file1.yuv file2.yuv\n");
+    return -1;
+  }
+  char* name1 = argv[1];
+  char* name2 = (argc > 2) ? argv[2] : NULL;
+  FILE* fin1 = fopen(name1, "rb");
+  FILE* fin2 = name2 ? fopen(name2, "rb") : NULL;
+
+  const int kBlockSize = 32768;
+  uint8_t buf1[kBlockSize];
+  uint8_t buf2[kBlockSize];
+  uint32_t hash1 = 5381;
+  uint32_t hash2 = 5381;
+  uint64_t sum_square_err = 0;
+  uint64_t size_min = 0;
+  int amt1 = 0;
+  int amt2 = 0;
+  do {
+    amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1));
+    if (amt1 > 0) {
+      hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
+    }
+    if (fin2) {
+      amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2));
+      if (amt2 > 0) {
+        hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
+      }
+      int amt_min = (amt1 < amt2) ? amt1 : amt2;
+      size_min += amt_min;
+      sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
+    }
+  } while (amt1 > 0 || amt2 > 0);
+
+  printf("hash1 %x", hash1);
+  if (fin2) {
+    printf(", hash2 %x", hash2);
+    double mse =
+        static_cast<double>(sum_square_err) / static_cast<double>(size_min);
+    printf(", mse %.2f", mse);
+    double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min);
+    printf(", psnr %.2f\n", psnr);
+    fclose(fin2);
+  }
+  fclose(fin1);
+}
diff --git a/util/cpuid.c b/util/cpuid.c
new file mode 100644
index 00000000..edc6a26e
--- /dev/null
+++ b/util/cpuid.c
@@ -0,0 +1,128 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+
+#ifdef __cplusplus
+using namespace libyuv;
+#endif
+
+int main(int argc, const char* argv[]) {
+  int cpu_flags = TestCpuFlag(-1);
+  int has_arm = TestCpuFlag(kCpuHasARM);
+  int has_riscv = TestCpuFlag(kCpuHasRISCV);
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  int has_mips = TestCpuFlag(kCpuHasMIPS);
+  int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
+  (void)argc;
+  (void)argv;
+
+#if defined(__i386__) || defined(__x86_64__) || \
+    defined(_M_IX86) || defined(_M_X64)
+  if (has_x86) {
+    int family, model, cpu_info[4];
+    // Vendor ID:
+    // AuthenticAMD AMD processor
+    // CentaurHauls Centaur processor
+    // CyrixInstead Cyrix processor
+    // GenuineIntel Intel processor
+    // GenuineTMx86 Transmeta processor
+    // Geode by NSC National Semiconductor processor
+    // NexGenDriven NexGen processor
+    // RiseRiseRise Rise Technology processor
+    // SiS SiS SiS  SiS processor
+    // UMC UMC UMC  UMC processor
+    CpuId(0, 0, &cpu_info[0]);
+    cpu_info[0] = cpu_info[1];  // Reorder output
+    cpu_info[1] = cpu_info[3];
+    cpu_info[3] = 0;
+    printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
+
+    // CPU Family and Model
+    // 3:0 - Stepping
+    // 7:4 - Model
+    // 11:8 - Family
+    // 13:12 - Processor Type
+    // 19:16 - Extended Model
+    // 27:20 - Extended Family
+    CpuId(1, 0, &cpu_info[0]);
+    family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+    model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+           model, model);
+  }
+#endif
+  printf("Cpu Flags 0x%x\n", cpu_flags);
+  if (has_arm) {
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    printf("Has ARM 0x%x\n", has_arm);
+    printf("Has NEON 0x%x\n", has_neon);
+  }
+  if (has_riscv) {
+    int has_rvv = TestCpuFlag(kCpuHasRVV);
+    printf("Has RISCV 0x%x\n", has_riscv);
+    printf("Has RVV 0x%x\n", has_rvv);
+  }
+  if (has_mips) {
+    int has_msa = TestCpuFlag(kCpuHasMSA);
+    printf("Has MIPS 0x%x\n", has_mips);
+    printf("Has MSA 0x%x\n", has_msa);
+  }
+  if (has_loongarch) {
+    int has_lsx  = TestCpuFlag(kCpuHasLSX);
+    int has_lasx = TestCpuFlag(kCpuHasLASX);
+    printf("Has LOONGARCH 0x%x\n", has_loongarch);
+    printf("Has LSX 0x%x\n", has_lsx);
+    printf("Has LASX 0x%x\n", has_lasx);
+  }
+  if (has_x86) {
+    int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+    int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+    int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+    int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+    int has_avx = TestCpuFlag(kCpuHasAVX);
+    int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+    int has_erms = TestCpuFlag(kCpuHasERMS);
+    int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+    int has_f16c = TestCpuFlag(kCpuHasF16C);
+    int has_gfni = TestCpuFlag(kCpuHasGFNI);
+    int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
+    int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+    int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
+    int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
+    int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
+    int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
+    int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
+    printf("Has X86 0x%x\n", has_x86);
+    printf("Has SSE2 0x%x\n", has_sse2);
+    printf("Has SSSE3 0x%x\n", has_ssse3);
+    printf("Has SSE4.1 0x%x\n", has_sse41);
+    printf("Has SSE4.2 0x%x\n", has_sse42);
+    printf("Has AVX 0x%x\n", has_avx);
+    printf("Has AVX2 0x%x\n", has_avx2);
+    printf("Has ERMS 0x%x\n", has_erms);
+    printf("Has FMA3 0x%x\n", has_fma3);
+    printf("Has F16C 0x%x\n", has_f16c);
+    printf("Has GFNI 0x%x\n", has_gfni);
+    printf("Has AVX512BW 0x%x\n", has_avx512bw);
+    printf("Has AVX512VL 0x%x\n", has_avx512vl);
+    printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
+    printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
+    printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
+    printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
+    printf("Has AVX512VPOPCNTDQ 0x%x\n", has_avx512vpopcntdq);
+  }
+  return 0;
+}
+
diff --git a/util/i444tonv12_eg.cc b/util/i444tonv12_eg.cc
new file mode 100644
index 00000000..0fcb4095
--- /dev/null
+++ b/util/i444tonv12_eg.cc
@@ -0,0 +1,28 @@
+
+#include "libyuv/convert.h"
+
+#include <stdio.h>   // for printf
+#include <string.h>  // for memset
+
+int main(int, char**) {
+  unsigned char src_i444[640 * 400 * 3];
+  unsigned char dst_nv12[640 * 400 * 3 / 2];
+
+  for (size_t i = 0; i < sizeof(src_i444); ++i) {
+    src_i444[i] = i & 255;
+  }
+  memset(dst_nv12, 0, sizeof(dst_nv12));
+  libyuv::I444ToNV12(&src_i444[0], 640,              // source Y
+                     &src_i444[640 * 400], 640,      // source U
+                     &src_i444[640 * 400 * 2], 640,  // source V
+                     &dst_nv12[0], 640,              // dest Y
+                     &dst_nv12[640 * 400], 640,      // dest UV
+                     640, 400);                      // width and height
+
+  int checksum = 0;
+  for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
+    checksum += dst_nv12[i];
+  }
+  printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
+  return 0;
+}
\ No newline at end of file
diff --git a/util/psnr.cc b/util/psnr.cc
new file mode 100644
index 00000000..c7bee7f9
--- /dev/null
+++ b/util/psnr.cc
@@ -0,0 +1,291 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./psnr.h"  // NOLINT
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>  // For __cpuid()
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32_t;  // NOLINT
+#ifdef _MSC_VER
+typedef unsigned __int64 uint64_t;
+#else  // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64_t;  // NOLINT
+#else   // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64_t;  // NOLINT
+#endif  // __LP64__
+#endif  // _MSC_VER
+
+// libyuv provides this function when linking library for jpeg support.
+#if !defined(HAVE_JPEG)
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+#define HAS_SUMSQUAREERROR_NEON
+static uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                                    const uint8_t* src_b,
+                                    int count) {
+  volatile uint32_t sse;
+  asm volatile(
+      "vmov.u8    q7, #0                         \n"
+      "vmov.u8    q9, #0                         \n"
+      "vmov.u8    q8, #0                         \n"
+      "vmov.u8    q10, #0                        \n"
+
+      "1:                                        \n"
+      "vld1.u8    {q0}, [%0]!                    \n"
+      "vld1.u8    {q1}, [%1]!                    \n"
+      "vsubl.u8   q2, d0, d2                     \n"
+      "vsubl.u8   q3, d1, d3                     \n"
+      "vmlal.s16  q7, d4, d4                     \n"
+      "vmlal.s16  q8, d6, d6                     \n"
+      "vmlal.s16  q8, d5, d5                     \n"
+      "vmlal.s16  q10, d7, d7                    \n"
+      "subs       %2, %2, #16                    \n"
+      "bhi        1b                             \n"
+
+      "vadd.u32   q7, q7, q8                     \n"
+      "vadd.u32   q9, q9, q10                    \n"
+      "vadd.u32   q10, q7, q9                    \n"
+      "vpaddl.u32 q1, q10                        \n"
+      "vadd.u64   d0, d2, d3                     \n"
+      "vmov.32    %3, d0[0]                      \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
+  return sse;
+}
+#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SUMSQUAREERROR_NEON
+static uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                                    const uint8_t* src_b,
+                                    int count) {
+  volatile uint32_t sse;
+  asm volatile(
+      "eor        v16.16b, v16.16b, v16.16b      \n"
+      "eor        v18.16b, v18.16b, v18.16b      \n"
+      "eor        v17.16b, v17.16b, v17.16b      \n"
+      "eor        v19.16b, v19.16b, v19.16b      \n"
+
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"
+      "ld1        {v1.16b}, [%1], #16            \n"
+      "subs       %w2, %w2, #16                  \n"
+      "usubl      v2.8h, v0.8b, v1.8b            \n"
+      "usubl2     v3.8h, v0.16b, v1.16b          \n"
+      "smlal      v16.4s, v2.4h, v2.4h           \n"
+      "smlal      v17.4s, v3.4h, v3.4h           \n"
+      "smlal2     v18.4s, v2.8h, v2.8h           \n"
+      "smlal2     v19.4s, v3.8h, v3.8h           \n"
+      "b.gt       1b                             \n"
+
+      "add        v16.4s, v16.4s, v17.4s         \n"
+      "add        v18.4s, v18.4s, v19.4s         \n"
+      "add        v19.4s, v16.4s, v18.4s         \n"
+      "addv       s0, v19.4s                     \n"
+      "fmov       %w3, s0                        \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_SUMSQUAREERROR_SSE2
+__declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/,
+                                                      const uint8_t* /*src_b*/,
+                                                      int /*count*/) {
+  __asm {
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
+    pxor       xmm0, xmm0
+    pxor       xmm5, xmm5
+    sub        edx, eax
+
+  wloop:
+    movdqu     xmm1, [eax]
+    movdqu     xmm2, [eax + edx]
+    lea        eax,  [eax + 16]
+    movdqu     xmm3, xmm1
+    psubusb    xmm1, xmm2
+    psubusb    xmm2, xmm3
+    por        xmm1, xmm2
+    movdqu     xmm2, xmm1
+    punpcklbw  xmm1, xmm5
+    punpckhbw  xmm2, xmm5
+    pmaddwd    xmm1, xmm1
+    pmaddwd    xmm2, xmm2
+    paddd      xmm0, xmm1
+    paddd      xmm0, xmm2
+    sub        ecx, 16
+    ja         wloop
+
+    pshufd     xmm1, xmm0, 0EEh
+    paddd      xmm0, xmm1
+    pshufd     xmm1, xmm0, 01h
+    paddd      xmm0, xmm1
+    movd       eax, xmm0
+    ret
+  }
+}
+#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+static uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+                                    const uint8_t* src_b,
+                                    int count) {
+  uint32_t sse;
+  asm volatile(  // NOLINT
+      "pxor      %%xmm0,%%xmm0                   \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+      "sub       %0,%1                           \n"
+
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "movdqu    (%0,%1,1),%%xmm2                \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    %%xmm1,%%xmm3                   \n"
+      "psubusb   %%xmm2,%%xmm1                   \n"
+      "psubusb   %%xmm3,%%xmm2                   \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "movdqu    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpckhbw %%xmm5,%%xmm2                   \n"
+      "pmaddwd   %%xmm1,%%xmm1                   \n"
+      "pmaddwd   %%xmm2,%%xmm2                   \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "paddd     %%xmm2,%%xmm0                   \n"
+      "sub       $0x10,%2                        \n"
+      "ja        1b                              \n"
+
+      "pshufd    $0xee,%%xmm0,%%xmm1             \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "movd      %%xmm0,%3                       \n"
+
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=g"(sse)     // %3
+      :
+      : "memory", "cc"
+#if defined(__SSE2__)
+        ,
+        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );  // NOLINT
+  return sse;
+}
+#endif  // LIBYUV_DISABLE_X86 etc
+
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile(  // NOLINT
+      "mov %%ebx, %%edi                          \n"
+      "cpuid                                     \n"
+      "xchg %%edi, %%ebx                         \n"
+      : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
+        "=d"(cpu_info[3])
+      : "a"(info_type));
+}
+// For gcc/clang but not clangcl.
+#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile(  // NOLINT
+      "cpuid                                     \n"
+      : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
+        "=d"(cpu_info[3])
+      : "a"(info_type));
+}
+#endif
+
+static int CpuHasSSE2() {
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
+  if (cpu_info[3] & 0x04000000) {
+    return 1;
+  }
+#endif
+  return 0;
+}
+#endif  // HAS_SUMSQUAREERROR_SSE2
+
+static uint32_t SumSquareError_C(const uint8_t* src_a,
+                                 const uint8_t* src_b,
+                                 int count) {
+  uint32_t sse = 0u;
+  for (int x = 0; x < count; ++x) {
+    int diff = src_a[x] - src_b[x];
+    sse += static_cast<uint32_t>(diff * diff);
+  }
+  return sse;
+}
+
+double ComputeSumSquareError(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+                             int count) = SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  SumSquareError = SumSquareError_NEON;
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (CpuHasSSE2()) {
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+  const int kBlockSize = 1 << 15;
+  uint64_t sse = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : sse)
+#endif
+  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  int remainder = count & (kBlockSize - 1) & ~15;
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 15;
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return static_cast<double>(sse);
+}
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size) {
+  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
+  if (sse <= kMINSSE) {
+    sse = kMINSSE;  // Produces max PSNR of 128
+  }
+  return 10.0 * log10(255.0 * 255.0 * size / sse);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/util/psnr.h b/util/psnr.h
new file mode 100644
index 00000000..aac128cb
--- /dev/null
+++ b/util/psnr.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_PSNR_H_  // NOLINT
+#define UTIL_PSNR_H_
+
+#include <math.h>  // For log10()
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8_t;
+#define UINT8_TYPE_DEFINED
+#endif
+
+static const double kMaxPSNR = 128.0;
+
+// libyuv provides this function when linking library for jpeg support.
+// TODO(fbarchard): make psnr lib compatible subset of libyuv.
+#if !defined(HAVE_JPEG)
+// Computer Sum of Squared Error (SSE).
+// Pass this to ComputePSNR for final result.
+double ComputeSumSquareError(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count);
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // UTIL_PSNR_H_  // NOLINT
diff --git a/util/psnr_main.cc b/util/psnr_main.cc
new file mode 100644
index 00000000..8b9fd972
--- /dev/null
+++ b/util/psnr_main.cc
@@ -0,0 +1,633 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR or SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+// To build: g++ -O3 -o psnr psnr.cc ssim.cc psnr_main.cc
+// or VisualC: cl /Ox psnr.cc ssim.cc psnr_main.cc
+//
+// To enable OpenMP and SSE2
+// gcc: g++ -msse2 -O3 -fopenmp -o psnr psnr.cc ssim.cc psnr_main.cc
+// vc:  cl /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+//
+// Usage: psnr org_seq rec_seq -s width height [-skip skip_org skip_rec]
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "./psnr.h"
+#include "./ssim.h"
+#ifdef HAVE_JPEG
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#endif
+
+struct metric {
+  double y, u, v, all;
+  double min_y, min_u, min_v, min_all;
+  double global_y, global_u, global_v, global_all;
+  int min_frame;
+};
+
+// options
+bool verbose = false;
+bool quiet = false;
+bool show_name = false;
+bool do_swap_uv = false;
+bool do_psnr = false;
+bool do_ssim = false;
+bool do_mse = false;
+bool do_lssim = false;
+int image_width = 0, image_height = 0;
+int fileindex_org = 0;  // argv argument contains the source file name.
+int fileindex_rec = 0;  // argv argument contains the destination file name.
+int num_rec = 0;
+int num_skip_org = 0;
+int num_skip_rec = 0;
+int num_frames = 0;
+#ifdef _OPENMP
+int num_threads = 0;
+#endif
+
+// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv
+bool ExtractResolutionFromFilename(const char* name,
+                                   int* width_ptr,
+                                   int* height_ptr) {
+  // Isolate the .width_height. section of the filename by searching for a
+  // dot or underscore followed by a digit.
+  for (int i = 0; name[i]; ++i) {
+    if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' &&
+        name[i + 1] <= '9') {
+      int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr);  // NOLINT
+      if (2 == n) {
+        return true;
+      }
+    }
+  }
+
+#ifdef HAVE_JPEG
+  // Try parsing file as a jpeg.
+  FILE* const file_org = fopen(name, "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", name);
+    return false;
+  }
+  fseek(file_org, 0, SEEK_END);
+  size_t total_size = ftell(file_org);
+  fseek(file_org, 0, SEEK_SET);
+  uint8_t* const ch_org = new uint8_t[total_size];
+  memset(ch_org, 0, total_size);
+  size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org);
+  fclose(file_org);
+  if (bytes_org == total_size) {
+    if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
+      delete[] ch_org;
+      return true;
+    }
+  }
+  delete[] ch_org;
+#endif  // HAVE_JPEG
+  return false;
+}
+
+// Scale Y channel from 16..240 to 0..255.
+// This can be useful when comparing codecs that are inconsistant about Y
+uint8_t ScaleY(uint8_t y) {
+  int ny = (y - 16) * 256 / 224;
+  if (ny < 0) {
+    ny = 0;
+  }
+  if (ny > 255) {
+    ny = 255;
+  }
+  return static_cast<uint8_t>(ny);
+}
+
+// MSE = Mean Square Error
+double GetMSE(double sse, double size) {
+  return sse / size;
+}
+
+void PrintHelp(const char* program) {
+  printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
+#ifdef HAVE_JPEG
+  printf("jpeg or raw YUV 420 supported.\n");
+#endif
+  printf("options:\n");
+  printf(
+      " -s <width> <height> .... specify YUV size, mandatory if none of the "
+      "sequences have the\n");
+  printf(
+      "                          resolution embedded in their filename (ie. "
+      "name.1920x800_24Hz_P420.yuv)\n");
+  printf(" -psnr .................. compute PSNR (default)\n");
+  printf(" -ssim .................. compute SSIM\n");
+  printf(" -mse ................... compute MSE\n");
+  printf(" -swap .................. Swap U and V plane\n");
+  printf(" -skip <org> <rec> ...... Number of frame to skip of org and rec\n");
+  printf(" -frames <num> .......... Number of frames to compare\n");
+#ifdef _OPENMP
+  printf(" -t <num> ............... Number of threads\n");
+#endif
+  printf(" -n ..................... Show file name\n");
+  printf(" -v ..................... verbose++\n");
+  printf(" -q ..................... quiet\n");
+  printf(" -h ..................... this help\n");
+  exit(0);
+}
+
+void ParseOptions(int argc, const char* argv[]) {
+  if (argc <= 1) {
+    PrintHelp(argv[0]);
+  }
+  for (int c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-v")) {
+      verbose = true;
+    } else if (!strcmp(argv[c], "-q")) {
+      quiet = true;
+    } else if (!strcmp(argv[c], "-n")) {
+      show_name = true;
+    } else if (!strcmp(argv[c], "-psnr")) {
+      do_psnr = true;
+    } else if (!strcmp(argv[c], "-mse")) {
+      do_mse = true;
+    } else if (!strcmp(argv[c], "-ssim")) {
+      do_ssim = true;
+    } else if (!strcmp(argv[c], "-lssim")) {
+      do_ssim = true;
+      do_lssim = true;
+    } else if (!strcmp(argv[c], "-swap")) {
+      do_swap_uv = true;
+    } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      PrintHelp(argv[0]);
+    } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
+      image_width = atoi(argv[++c]);   // NOLINT
+      image_height = atoi(argv[++c]);  // NOLINT
+    } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) {
+      num_skip_org = atoi(argv[++c]);  // NOLINT
+      num_skip_rec = atoi(argv[++c]);  // NOLINT
+    } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
+      num_frames = atoi(argv[++c]);  // NOLINT
+#ifdef _OPENMP
+    } else if (!strcmp(argv[c], "-t") && c + 1 < argc) {
+      num_threads = atoi(argv[++c]);  // NOLINT
+#endif
+    } else if (argv[c][0] == '-') {
+      fprintf(stderr, "Unknown option. %s\n", argv[c]);
+    } else if (fileindex_org == 0) {
+      fileindex_org = c;
+    } else if (fileindex_rec == 0) {
+      fileindex_rec = c;
+      num_rec = 1;
+    } else {
+      ++num_rec;
+    }
+  }
+  if (fileindex_org == 0 || fileindex_rec == 0) {
+    fprintf(stderr, "Missing filenames\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_skip_org < 0 || num_skip_rec < 0) {
+    fprintf(stderr, "Skipped frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_frames < 0) {
+    fprintf(stderr, "Number of frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (image_width == 0 || image_height == 0) {
+    int org_width, org_height;
+    int rec_width, rec_height;
+    bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+                                                       &org_width, &org_height);
+    bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+                                                       &rec_width, &rec_height);
+    if (org_res_avail) {
+      if (rec_res_avail) {
+        if ((org_width == rec_width) && (org_height == rec_height)) {
+          image_width = org_width;
+          image_height = org_height;
+        } else {
+          fprintf(stderr, "Sequences have different resolutions.\n");
+          PrintHelp(argv[0]);
+        }
+      } else {
+        image_width = org_width;
+        image_height = org_height;
+      }
+    } else if (rec_res_avail) {
+      image_width = rec_width;
+      image_height = rec_height;
+    } else {
+      fprintf(stderr, "Missing dimensions.\n");
+      PrintHelp(argv[0]);
+    }
+  }
+}
+
+bool UpdateMetrics(uint8_t* ch_org,
+                   uint8_t* ch_rec,
+                   const int y_size,
+                   const int uv_size,
+                   const size_t total_size,
+                   int number_of_frames,
+                   metric* cur_distortion_psnr,
+                   metric* distorted_frame,
+                   bool compute_psnr) {
+  const int uv_offset = (do_swap_uv ? uv_size : 0);
+  const uint8_t* const u_org = ch_org + y_size + uv_offset;
+  const uint8_t* const u_rec = ch_rec + y_size;
+  const uint8_t* const v_org = ch_org + y_size + (uv_size - uv_offset);
+  const uint8_t* const v_rec = ch_rec + y_size + uv_size;
+  if (compute_psnr) {
+#ifdef HAVE_JPEG
+    double y_err = static_cast<double>(
+        libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
+    double u_err = static_cast<double>(
+        libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
+    double v_err = static_cast<double>(
+        libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
+#else
+    double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
+    double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
+    double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
+#endif
+    const double total_err = y_err + u_err + v_err;
+    cur_distortion_psnr->global_y += y_err;
+    cur_distortion_psnr->global_u += u_err;
+    cur_distortion_psnr->global_v += v_err;
+    cur_distortion_psnr->global_all += total_err;
+    distorted_frame->y = ComputePSNR(y_err, static_cast<double>(y_size));
+    distorted_frame->u = ComputePSNR(u_err, static_cast<double>(uv_size));
+    distorted_frame->v = ComputePSNR(v_err, static_cast<double>(uv_size));
+    distorted_frame->all =
+        ComputePSNR(total_err, static_cast<double>(total_size));
+  } else {
+    distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
+    distorted_frame->u =
+        CalcSSIM(u_org, u_rec, (image_width + 1) / 2, (image_height + 1) / 2);
+    distorted_frame->v =
+        CalcSSIM(v_org, v_rec, (image_width + 1) / 2, (image_height + 1) / 2);
+    distorted_frame->all =
+        (distorted_frame->y + distorted_frame->u + distorted_frame->v) /
+        total_size;
+    distorted_frame->y /= y_size;
+    distorted_frame->u /= uv_size;
+    distorted_frame->v /= uv_size;
+
+    if (do_lssim) {
+      distorted_frame->all = CalcLSSIM(distorted_frame->all);
+      distorted_frame->y = CalcLSSIM(distorted_frame->y);
+      distorted_frame->u = CalcLSSIM(distorted_frame->u);
+      distorted_frame->v = CalcLSSIM(distorted_frame->v);
+    }
+  }
+
+  cur_distortion_psnr->y += distorted_frame->y;
+  cur_distortion_psnr->u += distorted_frame->u;
+  cur_distortion_psnr->v += distorted_frame->v;
+  cur_distortion_psnr->all += distorted_frame->all;
+
+  bool ismin = false;
+  if (distorted_frame->y < cur_distortion_psnr->min_y) {
+    cur_distortion_psnr->min_y = distorted_frame->y;
+  }
+  if (distorted_frame->u < cur_distortion_psnr->min_u) {
+    cur_distortion_psnr->min_u = distorted_frame->u;
+  }
+  if (distorted_frame->v < cur_distortion_psnr->min_v) {
+    cur_distortion_psnr->min_v = distorted_frame->v;
+  }
+  if (distorted_frame->all < cur_distortion_psnr->min_all) {
+    cur_distortion_psnr->min_all = distorted_frame->all;
+    cur_distortion_psnr->min_frame = number_of_frames;
+    ismin = true;
+  }
+  return ismin;
+}
+
+int main(int argc, const char* argv[]) {
+  ParseOptions(argc, argv);
+  if (!do_psnr && !do_ssim) {
+    do_psnr = true;
+  }
+
+#ifdef _OPENMP
+  if (num_threads) {
+    omp_set_num_threads(num_threads);
+  }
+  if (verbose) {
+    printf("OpenMP %d procs\n", omp_get_num_procs());
+  }
+#endif
+  // Open original file (first file argument)
+  FILE* const file_org = fopen(argv[fileindex_org], "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+
+  // Open all files to compare to
+  FILE** file_rec = new FILE*[num_rec];
+  memset(file_rec, 0, num_rec * sizeof(FILE*));  // NOLINT
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb");
+    if (file_rec[cur_rec] == NULL) {
+      fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+      fclose(file_org);
+      for (int i = 0; i < cur_rec; ++i) {
+        fclose(file_rec[i]);
+      }
+      delete[] file_rec;
+      exit(1);
+    }
+  }
+
+  const int y_size = image_width * image_height;
+  const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2);
+  const size_t total_size = y_size + 2 * uv_size;  // NOLINT
+#if defined(_MSC_VER)
+  _fseeki64(
+      file_org,
+      static_cast<__int64>(num_skip_org) * static_cast<__int64>(total_size),
+      SEEK_SET);
+#else
+  fseek(file_org, num_skip_org * total_size, SEEK_SET);
+#endif
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+#if defined(_MSC_VER)
+    _fseeki64(
+        file_rec[cur_rec],
+        static_cast<__int64>(num_skip_rec) * static_cast<__int64>(total_size),
+        SEEK_SET);
+#else
+    fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET);
+#endif
+  }
+
+  uint8_t* const ch_org = new uint8_t[total_size];
+  uint8_t* const ch_rec = new uint8_t[total_size];
+  if (ch_org == NULL || ch_rec == NULL) {
+    fprintf(stderr, "No memory available\n");
+    fclose(file_org);
+    for (int i = 0; i < num_rec; ++i) {
+      fclose(file_rec[i]);
+    }
+    delete[] ch_org;
+    delete[] ch_rec;
+    delete[] file_rec;
+    exit(1);
+  }
+
+  metric* const distortion_psnr = new metric[num_rec];
+  metric* const distortion_ssim = new metric[num_rec];
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+    cur_distortion_psnr->y = 0.0;
+    cur_distortion_psnr->u = 0.0;
+    cur_distortion_psnr->v = 0.0;
+    cur_distortion_psnr->all = 0.0;
+    cur_distortion_psnr->min_y = kMaxPSNR;
+    cur_distortion_psnr->min_u = kMaxPSNR;
+    cur_distortion_psnr->min_v = kMaxPSNR;
+    cur_distortion_psnr->min_all = kMaxPSNR;
+    cur_distortion_psnr->min_frame = 0;
+    cur_distortion_psnr->global_y = 0.0;
+    cur_distortion_psnr->global_u = 0.0;
+    cur_distortion_psnr->global_v = 0.0;
+    cur_distortion_psnr->global_all = 0.0;
+    distortion_ssim[cur_rec] = cur_distortion_psnr[cur_rec];
+  }
+
+  if (verbose) {
+    printf("Size: %dx%d\n", image_width, image_height);
+  }
+
+  if (!quiet) {
+    printf("Frame");
+    if (do_psnr) {
+      printf("\t PSNR-Y \t PSNR-U \t PSNR-V \t PSNR-All \t Frame");
+    }
+    if (do_ssim) {
+      printf("\t  SSIM-Y\t  SSIM-U\t  SSIM-V\t  SSIM-All\t Frame");
+    }
+    if (show_name) {
+      printf("\tName\n");
+    } else {
+      printf("\n");
+    }
+  }
+
+  int number_of_frames;
+  for (number_of_frames = 0;; ++number_of_frames) {
+    if (num_frames && number_of_frames >= num_frames) {
+      break;
+    }
+
+    size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org);
+    if (bytes_org < total_size) {
+#ifdef HAVE_JPEG
+      // Try parsing file as a jpeg.
+      uint8_t* const ch_jpeg = new uint8_t[bytes_org];
+      memcpy(ch_jpeg, ch_org, bytes_org);
+      memset(ch_org, 0, total_size);
+
+      if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, ch_org, image_width,
+                                  ch_org + y_size, (image_width + 1) / 2,
+                                  ch_org + y_size + uv_size,
+                                  (image_width + 1) / 2, image_width,
+                                  image_height, image_width, image_height)) {
+        delete[] ch_jpeg;
+        break;
+      }
+      delete[] ch_jpeg;
+#else
+      break;
+#endif  // HAVE_JPEG
+    }
+
+    for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+      size_t bytes_rec =
+          fread(ch_rec, sizeof(uint8_t), total_size, file_rec[cur_rec]);
+      if (bytes_rec < total_size) {
+#ifdef HAVE_JPEG
+        // Try parsing file as a jpeg.
+        uint8_t* const ch_jpeg = new uint8_t[bytes_rec];
+        memcpy(ch_jpeg, ch_rec, bytes_rec);
+        memset(ch_rec, 0, total_size);
+
+        if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, ch_rec, image_width,
+                                    ch_rec + y_size, (image_width + 1) / 2,
+                                    ch_rec + y_size + uv_size,
+                                    (image_width + 1) / 2, image_width,
+                                    image_height, image_width, image_height)) {
+          delete[] ch_jpeg;
+          break;
+        }
+        delete[] ch_jpeg;
+#else
+        break;
+#endif  // HAVE_JPEG
+      }
+
+      if (verbose) {
+        printf("%5d", number_of_frames);
+      }
+      if (do_psnr) {
+        metric distorted_frame = {};
+        metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+        bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
+                                   number_of_frames, cur_distortion_psnr,
+                                   &distorted_frame, true);
+        if (verbose) {
+          printf("\t%10.6f", distorted_frame.y);
+          printf("\t%10.6f", distorted_frame.u);
+          printf("\t%10.6f", distorted_frame.v);
+          printf("\t%10.6f", distorted_frame.all);
+          printf("\t%5s", ismin ? "min" : "");
+        }
+      }
+      if (do_ssim) {
+        metric distorted_frame = {};
+        metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+        bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
+                                   number_of_frames, cur_distortion_ssim,
+                                   &distorted_frame, false);
+        if (verbose) {
+          printf("\t%10.6f", distorted_frame.y);
+          printf("\t%10.6f", distorted_frame.u);
+          printf("\t%10.6f", distorted_frame.v);
+          printf("\t%10.6f", distorted_frame.all);
+          printf("\t%5s", ismin ? "min" : "");
+        }
+      }
+      if (verbose) {
+        if (show_name) {
+          printf("\t%s", argv[fileindex_rec + cur_rec]);
+        }
+        printf("\n");
+      }
+    }
+  }
+
+  // Final PSNR computation.
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+    metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+    if (number_of_frames > 0) {
+      const double norm = 1. / static_cast<double>(number_of_frames);
+      cur_distortion_psnr->y *= norm;
+      cur_distortion_psnr->u *= norm;
+      cur_distortion_psnr->v *= norm;
+      cur_distortion_psnr->all *= norm;
+      cur_distortion_ssim->y *= norm;
+      cur_distortion_ssim->u *= norm;
+      cur_distortion_ssim->v *= norm;
+      cur_distortion_ssim->all *= norm;
+    }
+
+    if (do_psnr) {
+      const double global_psnr_y =
+          ComputePSNR(cur_distortion_psnr->global_y,
+                      static_cast<double>(y_size) * number_of_frames);
+      const double global_psnr_u =
+          ComputePSNR(cur_distortion_psnr->global_u,
+                      static_cast<double>(uv_size) * number_of_frames);
+      const double global_psnr_v =
+          ComputePSNR(cur_distortion_psnr->global_v,
+                      static_cast<double>(uv_size) * number_of_frames);
+      const double global_psnr_all =
+          ComputePSNR(cur_distortion_psnr->global_all,
+                      static_cast<double>(total_size) * number_of_frames);
+      printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_psnr_y,
+             global_psnr_u, global_psnr_v, global_psnr_all, number_of_frames);
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+
+    if (!quiet) {
+      printf("Avg:");
+      if (do_psnr) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->y,
+               cur_distortion_psnr->u, cur_distortion_psnr->v,
+               cur_distortion_psnr->all, number_of_frames);
+      }
+      if (do_ssim) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->y,
+               cur_distortion_ssim->u, cur_distortion_ssim->v,
+               cur_distortion_ssim->all, number_of_frames);
+      }
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+    if (!quiet) {
+      printf("Min:");
+      if (do_psnr) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+               cur_distortion_psnr->min_y, cur_distortion_psnr->min_u,
+               cur_distortion_psnr->min_v, cur_distortion_psnr->min_all,
+               cur_distortion_psnr->min_frame);
+      }
+      if (do_ssim) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+               cur_distortion_ssim->min_y, cur_distortion_ssim->min_u,
+               cur_distortion_ssim->min_v, cur_distortion_ssim->min_all,
+               cur_distortion_ssim->min_frame);
+      }
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+
+    if (do_mse) {
+      double global_mse_y =
+          GetMSE(cur_distortion_psnr->global_y,
+                 static_cast<double>(y_size) * number_of_frames);
+      double global_mse_u =
+          GetMSE(cur_distortion_psnr->global_u,
+                 static_cast<double>(uv_size) * number_of_frames);
+      double global_mse_v =
+          GetMSE(cur_distortion_psnr->global_v,
+                 static_cast<double>(uv_size) * number_of_frames);
+      double global_mse_all =
+          GetMSE(cur_distortion_psnr->global_all,
+                 static_cast<double>(total_size) * number_of_frames);
+      printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_mse_y,
+             global_mse_u, global_mse_v, global_mse_all, number_of_frames);
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+  }
+  fclose(file_org);
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    fclose(file_rec[cur_rec]);
+  }
+  delete[] distortion_psnr;
+  delete[] distortion_ssim;
+  delete[] ch_org;
+  delete[] ch_rec;
+  delete[] file_rec;
+  return 0;
+}
diff --git a/util/ssim.cc b/util/ssim.cc
new file mode 100644
index 00000000..096fbcf0
--- /dev/null
+++ b/util/ssim.cc
@@ -0,0 +1,364 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../util/ssim.h"  // NOLINT
+
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32_t;    // NOLINT
+typedef unsigned short uint16_t;  // NOLINT
+
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
+    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
+#define __SSE2__
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// SSIM
+enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
+
+// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
+// The maximum value (11 x 11) must be less than 128 to avoid sign
+// problems during the calls to _mm_mullo_epi16().
+static const int K[KERNEL_SIZE] = {
+    1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
+};
+static const double kiW[KERNEL + 1 + 1] = {
+    1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
+    1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
+    1. / 1056.,  // 1 / sum(i:0..5, j..6) K[i]*K[j]
+    1. / 957.,   // 1 / sum(i:0..4, j..6) K[i]*K[j]
+    1. / 726.,   // 1 / sum(i:0..3, j..6) K[i]*K[j]
+};
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+
+#define PWEIGHT(A, B) static_cast<uint16_t>(K[(A)] * K[(B)])  // weight product
+#define MAKE_WEIGHT(L)                                                \
+  {                                                                   \
+    {                                                                 \
+      {                                                               \
+        PWEIGHT(L, 0)                                                 \
+        , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
+            PWEIGHT(L, 5), PWEIGHT(L, 6), 0                           \
+      }                                                               \
+    }                                                                 \
+  }
+
+// We need this union trick to be able to initialize constant static __m128i
+// values. We can't call _mm_set_epi16() for static compile-time initialization.
+static const struct {
+  union {
+    uint16_t i16_[8];
+    __m128i m_;
+  } values_;
+} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
+  W3 = MAKE_WEIGHT(3);
+// ... the rest is symmetric.
+#undef MAKE_WEIGHT
+#undef PWEIGHT
+#endif
+
+// Common final expression for SSIM, once the weighted sums are known.
+static double FinalizeSSIM(double iw,
+                           double xm,
+                           double ym,
+                           double xxm,
+                           double xym,
+                           double yym) {
+  const double iwx = xm * iw;
+  const double iwy = ym * iw;
+  double sxx = xxm * iw - iwx * iwx;
+  double syy = yym * iw - iwy * iwy;
+  // small errors are possible, due to rounding. Clamp to zero.
+  if (sxx < 0.) {
+    sxx = 0.;
+  }
+  if (syy < 0.) {
+    syy = 0.;
+  }
+  const double sxsy = sqrt(sxx * syy);
+  const double sxy = xym * iw - iwx * iwy;
+  static const double C11 = (0.01 * 0.01) * (255 * 255);
+  static const double C22 = (0.03 * 0.03) * (255 * 255);
+  static const double C33 = (0.015 * 0.015) * (255 * 255);
+  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
+  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
+  const double s = (sxy + C33) / (sxsy + C33);
+  return l * c * s;
+}
+
+// GetSSIM() does clipping.  GetSSIMFullKernel() does not
+
+// TODO(skal): use summed tables?
+// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
+// with a diff of 255, squared. The maximum error is thus 0x4388241,
+// which fits into 32 bits integers.
+double GetSSIM(const uint8_t* org,
+               const uint8_t* rec,
+               int xo,
+               int yo,
+               int W,
+               int H,
+               int stride) {
+  uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+  org += (yo - KERNEL) * stride;
+  org += (xo - KERNEL);
+  rec += (yo - KERNEL) * stride;
+  rec += (xo - KERNEL);
+  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
+    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) {
+      continue;
+    }
+    const int Wy = K[y_];
+    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
+      const int Wxy = Wy * K[x_];
+      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
+        const int org_x = org[x_];
+        const int rec_x = rec[x_];
+        ws += Wxy;
+        xm += Wxy * org_x;
+        ym += Wxy * rec_x;
+        xxm += Wxy * org_x * org_x;
+        xym += Wxy * org_x * rec_x;
+        yym += Wxy * rec_x * rec_x;
+      }
+    }
+  }
+  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
+}
+
+double GetSSIMFullKernel(const uint8_t* org,
+                         const uint8_t* rec,
+                         int xo,
+                         int yo,
+                         int stride,
+                         double area_weight) {
+  uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+
+#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
+
+  org += yo * stride + xo;
+  rec += yo * stride + xo;
+  for (int y = 1; y <= KERNEL; y++) {
+    const int dy1 = y * stride;
+    const int dy2 = y * stride;
+    const int Wy = K[KERNEL + y];
+
+    for (int x = 1; x <= KERNEL; x++) {
+      // Compute the contributions of upper-left (ul), upper-right (ur)
+      // lower-left (ll) and lower-right (lr) points (see the diagram below).
+      // Symmetric Kernel will have same weight on those points.
+      //       -  -  -  -  -  -  -
+      //       -  ul -  -  -  ur -
+      //       -  -  -  -  -  -  -
+      //       -  -  -  0  -  -  -
+      //       -  -  -  -  -  -  -
+      //       -  ll -  -  -  lr -
+      //       -  -  -  -  -  -  -
+      const int Wxy = Wy * K[KERNEL + x];
+      const int ul1 = org[-dy1 - x];
+      const int ur1 = org[-dy1 + x];
+      const int ll1 = org[dy1 - x];
+      const int lr1 = org[dy1 + x];
+
+      const int ul2 = rec[-dy2 - x];
+      const int ur2 = rec[-dy2 + x];
+      const int ll2 = rec[dy2 - x];
+      const int lr2 = rec[dy2 + x];
+
+      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
+      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
+      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
+      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
+      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
+    }
+
+    // Compute the contributions of up (u), down (d), left (l) and right (r)
+    // points across the main axes (see the diagram below).
+    // Symmetric Kernel will have same weight on those points.
+    //       -  -  -  -  -  -  -
+    //       -  -  -  u  -  -  -
+    //       -  -  -  -  -  -  -
+    //       -  l  -  0  -  r  -
+    //       -  -  -  -  -  -  -
+    //       -  -  -  d  -  -  -
+    //       -  -  -  -  -  -  -
+    const int Wxy = Wy * K[KERNEL];
+    const int u1 = org[-dy1];
+    const int d1 = org[dy1];
+    const int l1 = org[-y];
+    const int r1 = org[y];
+
+    const int u2 = rec[-dy2];
+    const int d2 = rec[dy2];
+    const int l2 = rec[-y];
+    const int r2 = rec[y];
+
+    xm += Wxy * (u1 + d1 + l1 + r1);
+    ym += Wxy * (u2 + d2 + l2 + r2);
+    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
+    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
+    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
+  }
+
+  // Lastly the contribution of (x0, y0) point.
+  const int Wxy = K[KERNEL] * K[KERNEL];
+  const int s1 = org[0];
+  const int s2 = rec[0];
+
+  xm += Wxy * s1;
+  ym += Wxy * s2;
+  xxm += Wxy * s1 * s1;
+  xym += Wxy * s1 * s2;
+  yym += Wxy * s2 * s2;
+
+#else  // __SSE2__
+
+  org += (yo - KERNEL) * stride + (xo - KERNEL);
+  rec += (yo - KERNEL) * stride + (xo - KERNEL);
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x = zero;
+  __m128i y = zero;
+  __m128i xx = zero;
+  __m128i xy = zero;
+  __m128i yy = zero;
+
+// Read 8 pixels at line #L, and convert to 16bit, perform weighting
+// and acccumulate.
+#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
+  do {                                                                       \
+    const __m128i v0 =                                                       \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
+    const __m128i v1 =                                                       \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
+    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
+    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
+    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
+    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
+    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
+    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
+    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
+    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
+    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
+    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
+    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
+  } while (0)
+
+#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
+  do {                                                      \
+    uint32_t tmp[4];                                        \
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
+    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
+  } while (0)
+
+  LOAD_LINE_PAIR(0, W0);
+  LOAD_LINE_PAIR(1, W1);
+  LOAD_LINE_PAIR(2, W2);
+  LOAD_LINE_PAIR(3, W3);
+  LOAD_LINE_PAIR(4, W2);
+  LOAD_LINE_PAIR(5, W1);
+  LOAD_LINE_PAIR(6, W0);
+
+  ADD_AND_STORE_FOUR_EPI32(x, xm);
+  ADD_AND_STORE_FOUR_EPI32(y, ym);
+  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
+  ADD_AND_STORE_FOUR_EPI32(xy, xym);
+  ADD_AND_STORE_FOUR_EPI32(yy, yym);
+
+#undef LOAD_LINE_PAIR
+#undef ADD_AND_STORE_FOUR_EPI32
+#endif
+
+  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
+}
+
+static int start_max(int x, int y) {
+  return (x > y) ? x : y;
+}
+
+double CalcSSIM(const uint8_t* org,
+                const uint8_t* rec,
+                const int image_width,
+                const int image_height) {
+  double SSIM = 0.;
+  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
+  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
+  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
+  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
+  const int stride = image_width;
+
+  for (int j = 0; j < KERNEL_Y; ++j) {
+    for (int i = 0; i < image_width; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+  }
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : SSIM)
+#endif
+  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
+    for (int i = 0; i < KERNEL_X; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+    for (int i = KERNEL_X; i < start_x; ++i) {
+      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
+    }
+    if (start_x < image_width) {
+      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
+      // copy the 8 rightmost pixels on a cache area, and pad this area with
+      // zeros which won't contribute to the overall SSIM value (but we need
+      // to pass the correct normalizing constant!). By using this cache, we can
+      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
+      // NOTE: we could use similar method for the left-most pixels too.
+      const int kScratchWidth = 8;
+      const int kScratchStride = kScratchWidth + KERNEL + 1;
+      uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0};
+      uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
+
+      for (int k = 0; k < KERNEL_SIZE; ++k) {
+        const int offset =
+            (j - KERNEL + k) * stride + image_width - kScratchWidth;
+        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
+        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
+      }
+      for (int k = 0; k <= KERNEL_X + 1; ++k) {
+        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
+                                  kScratchStride, kiW[k]);
+      }
+    }
+  }
+
+  for (int j = start_y; j < image_height; ++j) {
+    for (int i = 0; i < image_width; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+  }
+  return SSIM;
+}
+
+double CalcLSSIM(double ssim) {
+  return -10.0 * log10(1.0 - ssim);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/util/ssim.h b/util/ssim.h
new file mode 100644
index 00000000..a855f1d1
--- /dev/null
+++ b/util/ssim.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_SSIM_H_
+#define UTIL_SSIM_H_
+
+#include <math.h>  // For log10()
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8_t;
+#define UINT8_TYPE_DEFINED
+#endif
+
+double CalcSSIM(const uint8_t* org,
+                const uint8_t* rec,
+                const int image_width,
+                const int image_height);
+
+double CalcLSSIM(double ssim);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // UTIL_SSIM_H_
diff --git a/util/yuvconstants.c b/util/yuvconstants.c
new file mode 100644
index 00000000..4e5185af
--- /dev/null
+++ b/util/yuvconstants.c
@@ -0,0 +1,106 @@
+/*
+ *  Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8 bit YUV channels.
+
+// See Also
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// BT.709 full range YUV to RGB reference
+//  R = Y               + V * 1.5748
+//  G = Y - U * 0.18732 - V * 0.46812
+//  B = Y + U * 1.8556
+//  KR = 0.2126
+//  KB = 0.0722
+
+// // Y contribution to R,G,B.  Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32    /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22  /* round(0.34414 * 64) */
+// #define VG 46  /* round(0.71414 * 64) */
+// #define VR 90  /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int main(int argc, const char* argv[]) {
+  if (argc < 3) {
+    printf("yuvconstants [KR] [KB]\n");
+    printf("  e.g. yuvconstants 0.2126 0.0722\n");
+    printf("  MC BT          KR           KB\n");
+    printf("  1  BT.709      KR = 0.2126; KB = 0.0722\n");
+    printf("  4  FCC         KR = 0.30;   KB = 0.11\n");
+    printf("  6  BT.601      KR = 0.299;  KB = 0.114\n");
+    printf("  7  SMPTE 240M  KR = 0.212;  KB = 0.087\n");
+    printf("  9  BT.2020     KR = 0.2627; KB = 0.0593\n");
+    return -1;
+  }
+  float kr = (float)atof(argv[1]);
+  float kb = (float)atof(argv[2]);
+  float kg = 1 - kr - kb;
+
+  float vr = 2 * (1 - kr);
+  float ug = 2 * ((1 - kb) * kb / kg);
+  float vg = 2 * ((1 - kr) * kr / kg);
+  float ub = 2 * (1 - kb);
+
+  printf("Full range\n");
+  printf("R = Y                + V * %5f\n", vr);
+  printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+  printf("B = Y + U * %5f\n", ub);
+
+  printf("KR = %4f; ", kr);
+  printf("KB = %4f\n", kb);
+  // printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32    /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
+
+  printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
+  printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
+  printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
+  printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+
+  vr = 255.f / 224.f * 2 * (1 - kr);
+  ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+  vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+  ub = 255.f / 224.f * 2 * (1 - kb);
+
+  printf("\nLimited range\n");
+  printf("R = (Y - 16) * 1.164                + V * %5f\n", vr);
+  printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+  printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+  // printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32    /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
+
+  printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
+  printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
+  printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
+  printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+
+  return 0;
+}
diff --git a/util/yuvconvert.cc b/util/yuvconvert.cc
new file mode 100644
index 00000000..93b52668
--- /dev/null
+++ b/util/yuvconvert.cc
@@ -0,0 +1,367 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Convert an ARGB image to YUV.
+// Usage: yuvconvert src_argb.raw dst_yuv.raw
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/scale_argb.h"
+
+// options
+bool verbose = false;
+bool attenuate = false;
+bool unattenuate = false;
+int image_width = 0, image_height = 0;  // original width and height
+int dst_width = 0, dst_height = 0;      // new width and height
+int fileindex_org = 0;  // argv argument contains the original file name.
+int fileindex_rec = 0;  // argv argument contains the reconstructed file name.
+int num_rec = 0;        // Number of reconstructed images.
+int num_skip_org = 0;   // Number of frames to skip in original.
+int num_frames = 0;     // Number of frames to convert.
+int filter = 1;         // Bilinear filter for scaling.
+
+static __inline uint32_t Abs(int32_t v) {
+  return v >= 0 ? v : -v;
+}
+
+// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv
+static bool ExtractResolutionFromFilename(const char* name,
+                                          int* width_ptr,
+                                          int* height_ptr) {
+  // Isolate the .width_height. section of the filename by searching for a
+  // dot or underscore followed by a digit.
+  for (int i = 0; name[i]; ++i) {
+    if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' &&
+        name[i + 1] <= '9') {
+      int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr);  // NOLINT
+      if (2 == n) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+static void PrintHelp(const char* program) {
+  printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
+  printf(
+      " -s <width> <height> .... specify source resolution.  "
+      "Optional if name contains\n"
+      "                          resolution (ie. "
+      "name.1920x800_24Hz_P420.yuv)\n"
+      "                          Negative value mirrors.\n");
+  printf(" -d <width> <height> .... specify destination resolution.\n");
+  printf(" -f <filter> ............ 0 = point, 1 = bilinear (default).\n");
+  printf(" -skip <src_argb> ....... Number of frame to skip of src_argb\n");
+  printf(" -frames <num> .......... Number of frames to convert\n");
+  printf(" -attenuate ............. Attenuate the ARGB image\n");
+  printf(" -unattenuate ........... Unattenuate the ARGB image\n");
+  printf(" -v ..................... verbose\n");
+  printf(" -h ..................... this help\n");
+  exit(0);
+}
+
+static void ParseOptions(int argc, const char* argv[]) {
+  if (argc <= 1) {
+    PrintHelp(argv[0]);
+  }
+  for (int c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-v")) {
+      verbose = true;
+    } else if (!strcmp(argv[c], "-attenuate")) {
+      attenuate = true;
+    } else if (!strcmp(argv[c], "-unattenuate")) {
+      unattenuate = true;
+    } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      PrintHelp(argv[0]);
+    } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
+      image_width = atoi(argv[++c]);   // NOLINT
+      image_height = atoi(argv[++c]);  // NOLINT
+    } else if (!strcmp(argv[c], "-d") && c + 2 < argc) {
+      dst_width = atoi(argv[++c]);   // NOLINT
+      dst_height = atoi(argv[++c]);  // NOLINT
+    } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) {
+      num_skip_org = atoi(argv[++c]);  // NOLINT
+    } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
+      num_frames = atoi(argv[++c]);  // NOLINT
+    } else if (!strcmp(argv[c], "-f") && c + 1 < argc) {
+      filter = atoi(argv[++c]);  // NOLINT
+    } else if (argv[c][0] == '-') {
+      fprintf(stderr, "Unknown option. %s\n", argv[c]);
+    } else if (fileindex_org == 0) {
+      fileindex_org = c;
+    } else if (fileindex_rec == 0) {
+      fileindex_rec = c;
+      num_rec = 1;
+    } else {
+      ++num_rec;
+    }
+  }
+  if (fileindex_org == 0 || fileindex_rec == 0) {
+    fprintf(stderr, "Missing filenames\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_skip_org < 0) {
+    fprintf(stderr, "Skipped frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_frames < 0) {
+    fprintf(stderr, "Number of frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+
+  int org_width, org_height;
+  int rec_width, rec_height;
+  bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+                                                     &org_width, &org_height);
+  bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+                                                     &rec_width, &rec_height);
+  if (image_width == 0 || image_height == 0) {
+    if (org_res_avail) {
+      image_width = org_width;
+      image_height = org_height;
+    } else if (rec_res_avail) {
+      image_width = rec_width;
+      image_height = rec_height;
+    } else {
+      fprintf(stderr, "Missing dimensions.\n");
+      PrintHelp(argv[0]);
+    }
+  }
+  if (dst_width == 0 || dst_height == 0) {
+    if (rec_res_avail) {
+      dst_width = rec_width;
+      dst_height = rec_height;
+    } else {
+      dst_width = Abs(image_width);
+      dst_height = Abs(image_height);
+    }
+  }
+}
+
+static const int kTileX = 32;
+static const int kTileY = 32;
+
+static int TileARGBScale(const uint8_t* src_argb,
+                         int src_stride_argb,
+                         int src_width,
+                         int src_height,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int destination_width,
+                         int destination_height,
+                         libyuv::FilterMode filtering) {
+  for (int y = 0; y < destination_height; y += kTileY) {
+    for (int x = 0; x < destination_width; x += kTileX) {
+      int clip_width = kTileX;
+      if (x + clip_width > destination_width) {
+        clip_width = destination_width - x;
+      }
+      int clip_height = kTileY;
+      if (y + clip_height > destination_height) {
+        clip_height = destination_height - y;
+      }
+      int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width,
+                                    src_height, dst_argb, dst_stride_argb,
+                                    destination_width, destination_height, x, y,
+                                    clip_width, clip_height, filtering);
+      if (r) {
+        return r;
+      }
+    }
+  }
+  return 0;
+}
+
+int main(int argc, const char* argv[]) {
+  ParseOptions(argc, argv);
+
+  // Open original file (first file argument)
+  FILE* const file_org = fopen(argv[fileindex_org], "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+
+  // Open all files to convert to
+  FILE** file_rec = new FILE*[num_rec];
+  memset(file_rec, 0, num_rec * sizeof(FILE*));  // NOLINT
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb");
+    if (file_rec[cur_rec] == NULL) {
+      fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+      fclose(file_org);
+      for (int i = 0; i < cur_rec; ++i) {
+        fclose(file_rec[i]);
+      }
+      delete[] file_rec;
+      exit(1);
+    }
+  }
+
+  bool org_is_yuv = strstr(argv[fileindex_org], "_P420.") != NULL;
+  bool org_is_argb = strstr(argv[fileindex_org], "_ARGB.") != NULL;
+  if (!org_is_yuv && !org_is_argb) {
+    fprintf(stderr, "Original format unknown %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+  int org_size = Abs(image_width) * Abs(image_height) * 4;  // ARGB
+  // Input is YUV
+  if (org_is_yuv) {
+    const int y_size = Abs(image_width) * Abs(image_height);
+    const int uv_size =
+        ((Abs(image_width) + 1) / 2) * ((Abs(image_height) + 1) / 2);
+    org_size = y_size + 2 * uv_size;  // YUV original.
+  }
+
+  const int dst_size = dst_width * dst_height * 4;  // ARGB scaled
+  const int y_size = dst_width * dst_height;
+  const int uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+  const size_t total_size = y_size + 2 * uv_size;
+#if defined(_MSC_VER)
+  _fseeki64(file_org,
+            static_cast<__int64>(num_skip_org) * static_cast<__int64>(org_size),
+            SEEK_SET);
+#else
+  fseek(file_org, num_skip_org * total_size, SEEK_SET);
+#endif
+
+  uint8_t* const ch_org = new uint8_t[org_size];
+  uint8_t* const ch_dst = new uint8_t[dst_size];
+  uint8_t* const ch_rec = new uint8_t[total_size];
+  if (ch_org == NULL || ch_rec == NULL) {
+    fprintf(stderr, "No memory available\n");
+    fclose(file_org);
+    for (int i = 0; i < num_rec; ++i) {
+      fclose(file_rec[i]);
+    }
+    delete[] ch_org;
+    delete[] ch_dst;
+    delete[] ch_rec;
+    delete[] file_rec;
+    exit(1);
+  }
+
+  if (verbose) {
+    printf("Size: %dx%d to %dx%d\n", image_width, image_height, dst_width,
+           dst_height);
+  }
+
+  int number_of_frames;
+  for (number_of_frames = 0;; ++number_of_frames) {
+    if (num_frames && number_of_frames >= num_frames) {
+      break;
+    }
+
+    // Load original YUV or ARGB frame.
+    size_t bytes_org =
+        fread(ch_org, sizeof(uint8_t), static_cast<size_t>(org_size), file_org);
+    if (bytes_org < static_cast<size_t>(org_size)) {
+      break;
+    }
+
+    // TODO(fbarchard): Attenuate doesnt need to know dimensions.
+    // ARGB attenuate frame
+    if (org_is_argb && attenuate) {
+      libyuv::ARGBAttenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+    }
+    // ARGB unattenuate frame
+    if (org_is_argb && unattenuate) {
+      libyuv::ARGBUnattenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+    }
+
+    for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+      // Scale YUV or ARGB frame.
+      if (org_is_yuv) {
+        int src_width = Abs(image_width);
+        int src_height = Abs(image_height);
+        int half_src_width = (src_width + 1) / 2;
+        int half_src_height = (src_height + 1) / 2;
+        int half_dst_width = (dst_width + 1) / 2;
+        int half_dst_height = (dst_height + 1) / 2;
+        I420Scale(
+            ch_org, src_width, ch_org + src_width * src_height, half_src_width,
+            ch_org + src_width * src_height + half_src_width * half_src_height,
+            half_src_width, image_width, image_height, ch_rec, dst_width,
+            ch_rec + dst_width * dst_height, half_dst_width,
+            ch_rec + dst_width * dst_height + half_dst_width * half_dst_height,
+            half_dst_width, dst_width, dst_height,
+            static_cast<libyuv::FilterMode>(filter));
+      } else {
+        TileARGBScale(ch_org, Abs(image_width) * 4, image_width, image_height,
+                      ch_dst, dst_width * 4, dst_width, dst_height,
+                      static_cast<libyuv::FilterMode>(filter));
+      }
+      bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL;
+      bool rec_is_argb =
+          strstr(argv[fileindex_rec + cur_rec], "_ARGB.") != NULL;
+      if (!rec_is_yuv && !rec_is_argb) {
+        fprintf(stderr, "Output format unknown %s\n",
+                argv[fileindex_rec + cur_rec]);
+        continue;  // Advance to next file.
+      }
+
+      // Convert ARGB to YUV.
+      if (!org_is_yuv && rec_is_yuv) {
+        int half_width = (dst_width + 1) / 2;
+        int half_height = (dst_height + 1) / 2;
+        libyuv::ARGBToI420(
+            ch_dst, dst_width * 4, ch_rec, dst_width,
+            ch_rec + dst_width * dst_height, half_width,
+            ch_rec + dst_width * dst_height + half_width * half_height,
+            half_width, dst_width, dst_height);
+      }
+
+      // Output YUV or ARGB frame.
+      if (rec_is_yuv) {
+        size_t bytes_rec =
+            fwrite(ch_rec, sizeof(uint8_t), static_cast<size_t>(total_size),
+                   file_rec[cur_rec]);
+        if (bytes_rec < static_cast<size_t>(total_size)) {
+          break;
+        }
+      } else {
+        size_t bytes_rec =
+            fwrite(ch_dst, sizeof(uint8_t), static_cast<size_t>(dst_size),
+                   file_rec[cur_rec]);
+        if (bytes_rec < static_cast<size_t>(dst_size)) {
+          break;
+        }
+      }
+      if (verbose) {
+        printf("%5d", number_of_frames);
+      }
+      if (verbose) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+        printf("\n");
+      }
+    }
+  }
+
+  fclose(file_org);
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    fclose(file_rec[cur_rec]);
+  }
+  delete[] ch_org;
+  delete[] ch_dst;
+  delete[] ch_rec;
+  delete[] file_rec;
+  return 0;
+}
-- 
cgit v1.2.3