11 files changed, 534 insertions, 125 deletions
diff --git a/ChangeLog b/ChangeLog
index 42720ae9..3c592631 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,7 +2,7 @@ Each utility has its own version number, date of last change and
 some description at the top of its ".c" file. All utilities in the main
 directory have their own "man" pages. There is also a sg3_utils man page.
 
-Changelog for sg3_utils-1.43 [20180309] [svn: r761]
+Changelog for sg3_utils-1.43 [20180316] [svn: r762]
   - sg_write_x: where x can be normal, atomic, or(write),
     same, scattered, or stream writes with 16 or 32 byte
     cdbs (sbc4r04 for atomic, sbc4r11 for scattered)
diff --git a/config.h.in b/config.h.in
index d575de9c..ab0eb007 100644
--- a/config.h.in
+++ b/config.h.in
@@ -1,5 +1,8 @@
 /* config.h.in.  Generated from configure.ac by autoheader.  */
 
+/* Define to 1 if you have the <byteswap.h> header file. */
+#undef HAVE_BYTESWAP_H
+
 /* Define to 1 if you have the `clock_gettime' function. */
 #undef HAVE_CLOCK_GETTIME
 
@@ -72,6 +75,9 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H
 
+/* use generic little-endian/big-endian instead */
+#undef IGNORE_FAST_LEBE
+
 /* option ignored */
 #undef IGNORE_LINUX_BSG
 
diff --git a/configure b/configure
index 40220754..9d1c1a4b 100755
--- a/configure
+++ b/configure
@@ -787,6 +787,7 @@ enable_linuxbsg
 enable_win32_spt_direct
 enable_scsistrings
 enable_nvme_supp
+enable_fast_lebe
 '
       ac_precious_vars='build_alias
 host_alias
@@ -1447,6 +1448,7 @@ Optional Features:
   --disable-scsistrings   Disable full SCSI sense strings and NVMe status
                           strings
   --disable-nvme-supp     remove all or most NVMe code
+  --disable-fast-lebe     use generic little-endian/big-endian code instead
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -12301,6 +12303,18 @@ $as_echo "#define STDC_HEADERS 1" >>confdefs.h
 
 fi
 
+for ac_header in byteswap.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "byteswap.h" "ac_cv_header_byteswap_h" "$ac_includes_default"
+if test "x$ac_cv_header_byteswap_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_BYTESWAP_H 1
+_ACEOF
+
+fi
+
+done
+
 
 # check for functions
 for ac_func in getopt_long
@@ -12745,6 +12759,16 @@ _ACEOF
 fi
 
 
+# Check whether --enable-fast-lebe was given.
+if test "${enable_fast_lebe+set}" = set; then :
+  enableval=$enable_fast_lebe;
+cat >>confdefs.h <<_ACEOF
+#define IGNORE_FAST_LEBE 1
+_ACEOF
+
+fi
+
+
 
 ac_config_files="$ac_config_files Makefile include/Makefile lib/Makefile src/Makefile doc/Makefile scripts/Makefile"
 
diff --git a/configure.ac b/configure.ac
index 354a0220..757a2c15 100644
--- a/configure.ac
+++ b/configure.ac
@@ -16,6 +16,7 @@ AC_PROG_LIBTOOL
 
 # check for headers
 AC_HEADER_STDC
+AC_CHECK_HEADERS([byteswap.h], [], [], [])
 
 # check for functions
 AC_CHECK_FUNCS(getopt_long,
@@ -114,6 +115,10 @@ AC_ARG_ENABLE([nvme-supp],
   AC_HELP_STRING([--disable-nvme-supp], [remove all or most NVMe code]),
   [AC_DEFINE_UNQUOTED(IGNORE_NVME, 1, [compile out NVMe support], )], [])
 
+AC_ARG_ENABLE([fast-lebe],
+  AC_HELP_STRING([--disable-fast-lebe], [use generic little-endian/big-endian code instead]),
+  [AC_DEFINE_UNQUOTED(IGNORE_FAST_LEBE, 1, [use generic little-endian/big-endian instead], )], [])
+
 
 AC_OUTPUT(Makefile include/Makefile lib/Makefile src/Makefile doc/Makefile scripts/Makefile)
 
diff --git a/include/Makefile.am b/include/Makefile.am
index 3dc1ef3b..64c27b43 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -8,6 +8,8 @@ scsiinclude_HEADERS = \
 	sg_cmds_basic.h \
 	sg_cmds_extra.h \
 	sg_cmds_mmc.h \
+	sg_pr2serr.h \
+	sg_unaligned.h \
 	sg_pt.h \
 	sg_pt_nvme.h
 
diff --git a/include/Makefile.in b/include/Makefile.in
index a16e29ec..f12229b1 100644
--- a/include/Makefile.in
+++ b/include/Makefile.in
@@ -127,9 +127,9 @@ am__can_run_installinfo = \
   esac
 am__noinst_HEADERS_DIST = sg_linux_inc.h sg_io_linux.h sg_pt_win32.h
 am__scsiinclude_HEADERS_DIST = sg_lib.h sg_lib_data.h sg_cmds.h \
-	sg_cmds_basic.h sg_cmds_extra.h sg_cmds_mmc.h sg_pt.h \
-	sg_pt_nvme.h sg_linux_inc.h sg_io_linux.h sg_pt_linux.h \
-	sg_pt_win32.h
+	sg_cmds_basic.h sg_cmds_extra.h sg_cmds_mmc.h sg_pr2serr.h \
+	sg_unaligned.h sg_pt.h sg_pt_nvme.h sg_linux_inc.h \
+	sg_io_linux.h sg_pt_linux.h sg_pt_win32.h
 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
 am__vpath_adj = case $$p in \
     $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
@@ -302,8 +302,9 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 scsiincludedir = $(includedir)/scsi
 scsiinclude_HEADERS = sg_lib.h sg_lib_data.h sg_cmds.h sg_cmds_basic.h \
-	sg_cmds_extra.h sg_cmds_mmc.h sg_pt.h sg_pt_nvme.h \
-	$(am__append_1) $(am__append_2) $(am__append_3)
+	sg_cmds_extra.h sg_cmds_mmc.h sg_pr2serr.h sg_unaligned.h \
+	sg_pt.h sg_pt_nvme.h $(am__append_1) $(am__append_2) \
+	$(am__append_3)
 @OS_FREEBSD_TRUE@noinst_HEADERS = \
 @OS_FREEBSD_TRUE@	sg_linux_inc.h \
 @OS_FREEBSD_TRUE@	sg_io_linux.h \
diff --git a/include/sg_unaligned.h b/include/sg_unaligned.h
index 0a5c92f1..b6f4698d 100644
--- a/include/sg_unaligned.h
+++ b/include/sg_unaligned.h
@@ -8,173 +8,279 @@
  * license that can be found in the BSD_LICENSE file.
  */
 
-#include <stdint.h>
+#include <stdbool.h>
+#include <stdint.h>     /* for uint8_t and friends */
+#include <string.h>     /* for memcpy */
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/* Borrowed from the Linux kernel, via mhvtl */
+/* These inline functions convert integers (always unsigned) to byte streams
+ * and vice versa. They have two goals:
+ *   - change the byte ordering of integers between host order and big
+ *     endian ("_be") or little endian ("_le")
+ *   - copy the big or little endian byte stream so it complies with any
+ *     alignment that host integers require
+ *
+ * Host integer to given endian byte stream is a "_put_" function taking
+ * two arguments (integer and pointer to byte stream) returning void.
+ * Given endian byte stream to host integer is a "_get_" function that takes
+ * one argument and returns an integer of appropriate size (uint32_t for 24
+ * bit operations, uint64_t for 48 bit operations).
+ *
+ * Big endian byte format "on the wire" is the default used by SCSI
+ * standards (www.t10.org). Big endian is also the network byte order.
+ * Little endian is used by ATA, PCI and NVMe.
+ */
+
+/* The generic form of these routines was borrowed from the Linux kernel,
+ * via mhvtl. There is a specialised version of the main functions for
+ * little endian or big endian provided that not-quite-standard defines for
+ * endianness are available from the compiler and the <byteswap.h> header
+ * (a GNU extension) has been detected by ./configure. To force the
+ * generic version, use './configure --disable-fast-lebe'. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"     /* need this to see if HAVE_BYTESWAP_H */
+#endif
+
+#undef GOT_UNALIGNED_SPECIALS   /* just in case */
 
-/* In the first section below, functions that copy unsigned integers in a
- * computer's native format, to and from an unaligned big endian sequence of
- * bytes. Big endian byte format "on the wire" is the default used by SCSI
- * standards (www.t10.org). Big endian is also the network byte order. */
+#if defined(__BYTE_ORDER__) && defined(HAVE_BYTESWAP_H) && \
+    ! defined(IGNORE_FAST_LEBE)
+
+#if defined(__LITTLE_ENDIAN__) || (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+
+#define GOT_UNALIGNED_SPECIALS 1
+
+#include <byteswap.h>           /* for bswap_16(), bswap_32() and bswap_64() */
+
+// #warning ">>>>>> Doing Little endian special unaligneds"
 
 static inline uint16_t __get_unaligned_be16(const uint8_t *p)
 {
-        return p[0] << 8 | p[1];
+        uint16_t u;
+
+        memcpy(&u, p, 2);
+        return bswap_16(u);
 }
 
 static inline uint32_t __get_unaligned_be32(const uint8_t *p)
 {
-        return p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
-}
+        uint32_t u;
 
-/* Assume 48 bit value placed in uint64_t */
-static inline uint64_t __get_unaligned_be48(const uint8_t *p)
-{
-        return (uint64_t)__get_unaligned_be16(p) << 32 |
-               __get_unaligned_be32(p + 2);
+        memcpy(&u, p, 4);
+        return bswap_32(u);
 }
 
 static inline uint64_t __get_unaligned_be64(const uint8_t *p)
 {
-        return (uint64_t)__get_unaligned_be32(p) << 32 |
-               __get_unaligned_be32(p + 4);
+        uint64_t u;
+
+        memcpy(&u, p, 8);
+        return bswap_64(u);
 }
 
 static inline void __put_unaligned_be16(uint16_t val, uint8_t *p)
 {
-        *p++ = (uint8_t)(val >> 8);
-        *p++ = (uint8_t)val;
+        uint16_t u = bswap_16(val);
+
+        memcpy(p, &u, 2);
 }
 
 static inline void __put_unaligned_be32(uint32_t val, uint8_t *p)
 {
-        __put_unaligned_be16(val >> 16, p);
-        __put_unaligned_be16(val, p + 2);
+        uint32_t u = bswap_32(val);
+
+        memcpy(p, &u, 4);
 }
 
-/* Assume 48 bit value placed in uint64_t */
-static inline void __put_unaligned_be48(uint64_t val, uint8_t *p)
+static inline void __put_unaligned_be64(uint64_t val, uint8_t *p)
 {
-        __put_unaligned_be16(val >> 32, p);
-        __put_unaligned_be32(val, p + 2);
+        uint64_t u = bswap_64(val);
+
+        memcpy(p, &u, 8);
 }
 
-static inline void __put_unaligned_be64(uint64_t val, uint8_t *p)
+static inline uint16_t __get_unaligned_le16(const uint8_t *p)
 {
-        __put_unaligned_be32(val >> 32, p);
-        __put_unaligned_be32(val, p + 4);
+        uint16_t u;
+
+        memcpy(&u, p, 2);
+        return u;
 }
 
-static inline uint16_t sg_get_unaligned_be16(const void *p)
+static inline uint32_t __get_unaligned_le32(const uint8_t *p)
 {
-        return __get_unaligned_be16((const uint8_t *)p);
+        uint32_t u;
+
+        memcpy(&u, p, 4);
+        return u;
 }
 
-static inline uint32_t sg_get_unaligned_be24(const void *p)
+static inline uint64_t __get_unaligned_le64(const uint8_t *p)
 {
-        return ((const uint8_t *)p)[0] << 16 | ((const uint8_t *)p)[1] << 8 |
-               ((const uint8_t *)p)[2];
+        uint64_t u;
+
+        memcpy(&u, p, 8);
+        return u;
 }
 
-static inline uint32_t sg_get_unaligned_be32(const void *p)
+static inline void __put_unaligned_le16(uint16_t val, uint8_t *p)
 {
-        return __get_unaligned_be32((const uint8_t *)p);
+        memcpy(p, &val, 2);
 }
 
-/* Assume 48 bit value placed in uint64_t */
-static inline uint64_t sg_get_unaligned_be48(const void *p)
+static inline void __put_unaligned_le32(uint32_t val, uint8_t *p)
 {
-        return __get_unaligned_be48((const uint8_t *)p);
+        memcpy(p, &val, 4);
 }
 
-static inline uint64_t sg_get_unaligned_be64(const void *p)
+static inline void __put_unaligned_le64(uint64_t val, uint8_t *p)
 {
-        return __get_unaligned_be64((const uint8_t *)p);
+        memcpy(p, &val, 8);
 }
 
-/* Returns 0 if 'num_bytes' is less than or equal to 0 or greater than
- * 8 (i.e. sizeof(uint64_t)). Else returns result in uint64_t which is
- * an 8 byte unsigned integer. */
-static inline uint64_t sg_get_unaligned_be(int num_bytes, const void *p)
+#elif defined(__BIG_ENDIAN__) || (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+
+#define GOT_UNALIGNED_SPECIALS 1
+
+#include <byteswap.h>
+
+// #warning ">>>>>> Doing BIG endian special unaligneds"
+
+static inline uint16_t __get_unaligned_le16(const uint8_t *p)
 {
-        if ((num_bytes <= 0) || (num_bytes > (int)sizeof(uint64_t)))
-                return 0;
-        else {
-                const uint8_t * xp = (const uint8_t *)p;
-                uint64_t res = *xp;
+        uint16_t u;
 
-                for (++xp; num_bytes > 1; ++xp, --num_bytes)
-                        res = (res << 8) | *xp;
-                return res;
-        }
+        memcpy(&u, p, 2);
+        return bswap_16(u);
 }
 
-static inline void sg_put_unaligned_be16(uint16_t val, void *p)
+static inline uint32_t __get_unaligned_le32(const uint8_t *p)
 {
-        __put_unaligned_be16(val, (uint8_t *)p);
+        uint32_t u;
+
+        memcpy(&u, p, 4);
+        return bswap_32(u);
 }
 
-static inline void sg_put_unaligned_be24(uint32_t val, void *p)
+static inline uint64_t __get_unaligned_le64(const uint8_t *p)
 {
-        ((uint8_t *)p)[0] = (val >> 16) & 0xff;
-        ((uint8_t *)p)[1] = (val >> 8) & 0xff;
-        ((uint8_t *)p)[2] = val & 0xff;
+        uint64_t u;
+
+        memcpy(&u, p, 8);
+        return bswap_64(u);
 }
 
-static inline void sg_put_unaligned_be32(uint32_t val, void *p)
+static inline void __put_unaligned_le16(uint16_t val, uint8_t *p)
 {
-        __put_unaligned_be32(val, (uint8_t *)p);
+        uint16_t u = bswap_16(val);
+
+        memcpy(p, &u, 2);
 }
 
-/* Assume 48 bit value placed in uint64_t */
-static inline void sg_put_unaligned_be48(uint64_t val, void *p)
+static inline void __put_unaligned_le32(uint32_t val, uint8_t *p)
 {
-        __put_unaligned_be48(val, (uint8_t *)p);
+        uint32_t u = bswap_32(val);
+
+        memcpy(p, &u, 4);
 }
 
-static inline void sg_put_unaligned_be64(uint64_t val, void *p)
+static inline void __put_unaligned_le64(uint64_t val, uint8_t *p)
 {
-        __put_unaligned_be64(val, (uint8_t *)p);
+        uint64_t u = bswap_64(val);
+
+        memcpy(p, &u, 8);
 }
 
-/* Since cdb and parameter blocks are often memset to zero before these
- * unaligned function partially fill them, then check for a val of zero
- * and ignore if it is with these variants. */
-static inline void sg_nz_put_unaligned_be16(uint16_t val, void *p)
+static inline uint16_t __get_unaligned_be16(const uint8_t *p)
 {
-        if (val)
-                __put_unaligned_be16(val, (uint8_t *)p);
+        uint16_t u;
+
+        memcpy(&u, p, 2);
+        return u;
 }
 
-static inline void sg_nz_put_unaligned_be24(uint32_t val, void *p)
+static inline uint32_t __get_unaligned_be32(const uint8_t *p)
 {
-        if (val) {
-                ((uint8_t *)p)[0] = (val >> 16) & 0xff;
-                ((uint8_t *)p)[1] = (val >> 8) & 0xff;
-                ((uint8_t *)p)[2] = val & 0xff;
-        }
+        uint32_t u;
+
+        memcpy(&u, p, 4);
+        return u;
 }
 
-static inline void sg_nz_put_unaligned_be32(uint32_t val, void *p)
+static inline uint64_t __get_unaligned_be64(const uint8_t *p)
 {
-        if (val)
-                __put_unaligned_be32(val, (uint8_t *)p);
+        uint64_t u;
+
+        memcpy(&u, p, 8);
+        return u;
 }
 
-static inline void sg_nz_put_unaligned_be64(uint64_t val, void *p)
+static inline void __put_unaligned_be16(uint16_t val, uint8_t *p)
 {
-        if (val)
-            __put_unaligned_be64(val, (uint8_t *)p);
+        memcpy(p, &val, 2);
+}
+
+static inline void __put_unaligned_be32(uint32_t val, uint8_t *p)
+{
+        memcpy(p, &val, 4);
+}
+
+static inline void __put_unaligned_be64(uint64_t val, uint8_t *p)
+{
+        memcpy(p, &val, 8);
 }
 
+#endif          /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__  */
+#endif          /* #if defined __BYTE_ORDER__ && defined <byteswap.h> &&
+                 *     ! defined IGNORE_FAST_LEBE */
+
+
+#ifndef GOT_UNALIGNED_SPECIALS
+
+/* Now we have no tricks left, so use the only way this can be done
+ * both correctly and safely in C: lots of shifts. */
+
+// #warning ">>>>>> Doing GENERIC unaligneds"
+
+static inline uint16_t __get_unaligned_be16(const uint8_t *p)
+{
+        return p[0] << 8 | p[1];
+}
+
+static inline uint32_t __get_unaligned_be32(const uint8_t *p)
+{       /* cast: as int, p[0] << 24 overflows (UB) when p[0] > 0x7f */
+        return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
+}       /* big endian bytes p[0..3] -> uint32_t, p[0] most significant */
+
+static inline uint64_t __get_unaligned_be64(const uint8_t *p)
+{
+        return (uint64_t)__get_unaligned_be32(p) << 32 |
+               __get_unaligned_be32(p + 4);
+}
+
+static inline void __put_unaligned_be16(uint16_t val, uint8_t *p)
+{
+        *p++ = (uint8_t)(val >> 8);
+        *p++ = (uint8_t)val;
+}
+
+static inline void __put_unaligned_be32(uint32_t val, uint8_t *p)
+{
+        __put_unaligned_be16(val >> 16, p);
+        __put_unaligned_be16(val, p + 2);
+}
+
+static inline void __put_unaligned_be64(uint64_t val, uint8_t *p)
+{
+        __put_unaligned_be32(val >> 32, p);
+        __put_unaligned_be32(val, p + 4);
+}
 
-/* Below are the little endian equivalents of the big endian functions
- * above. Little endian is used by ATA, PCI and NVMe.
- */
 
 static inline uint16_t __get_unaligned_le16(const uint8_t *p)
 {
@@ -210,27 +316,50 @@ static inline void __put_unaligned_le64(uint64_t val, uint8_t *p)
         __put_unaligned_le32(val, p);
 }
 
-static inline uint16_t sg_get_unaligned_le16(const void *p)
+#endif          /* #ifndef GOT_UNALIGNED_SPECIALS */
+
+
+/* These are the end user functions, essentially dummies just doing a
+ * cast to the __ functions that do the work. Hopefully the compiler
+ * inlines these functions (as instructed). */
+static inline uint16_t sg_get_unaligned_be16(const void *p)
 {
-        return __get_unaligned_le16((const uint8_t *)p);
+        return __get_unaligned_be16((const uint8_t *)p);
 }
 
-static inline uint32_t sg_get_unaligned_le24(const void *p)
+static inline uint32_t sg_get_unaligned_be32(const void *p)
 {
-        return (uint32_t)__get_unaligned_le16((const uint8_t *)p) |
-               ((const uint8_t *)p)[2] << 16;
+        return __get_unaligned_be32((const uint8_t *)p);
 }
 
-static inline uint32_t sg_get_unaligned_le32(const void *p)
+static inline uint64_t sg_get_unaligned_be64(const void *p)
 {
-        return __get_unaligned_le32((const uint8_t *)p);
+        return __get_unaligned_be64((const uint8_t *)p);
 }
 
-/* Assume 48 bit value placed in uint64_t */
-static inline uint64_t sg_get_unaligned_le48(const void *p)
+static inline void sg_put_unaligned_be16(uint16_t val, void *p)
 {
-        return (uint64_t)__get_unaligned_le16((const uint8_t *)p + 4) << 32 |
-               __get_unaligned_le32((const uint8_t *)p);
+        __put_unaligned_be16(val, (uint8_t *)p);
+}
+
+static inline void sg_put_unaligned_be32(uint32_t val, void *p)
+{
+        __put_unaligned_be32(val, (uint8_t *)p);
+}
+
+static inline void sg_put_unaligned_be64(uint64_t val, void *p)
+{
+        __put_unaligned_be64(val, (uint8_t *)p);
+}
+
+static inline uint16_t sg_get_unaligned_le16(const void *p)
+{
+        return __get_unaligned_le16((const uint8_t *)p);
+}
+
+static inline uint32_t sg_get_unaligned_le32(const void *p)
+{
+        return __get_unaligned_le32((const uint8_t *)p);
 }
 
 static inline uint64_t sg_get_unaligned_le64(const void *p)
@@ -238,26 +367,88 @@ static inline uint64_t sg_get_unaligned_le64(const void *p)
         return __get_unaligned_le64((const uint8_t *)p);
 }
 
+static inline void sg_put_unaligned_le16(uint16_t val, void *p)
+{
+        __put_unaligned_le16(val, (uint8_t *)p);
+}
+
+static inline void sg_put_unaligned_le32(uint32_t val, void *p)
+{
+        __put_unaligned_le32(val, (uint8_t *)p);
+}
+
+static inline void sg_put_unaligned_le64(uint64_t val, void *p)
+{
+        __put_unaligned_le64(val, (uint8_t *)p);
+}
+
+/* Following are lesser used conversions that don't have specializations
+ * for endianness; big endian first. In summary these are the 24, 48 bit and
+ * given-length conversions plus the "nz" conditional put conversions. */
+
+/* Now big endian, get 24+48 then put 24+48 */
+static inline uint32_t sg_get_unaligned_be24(const void *p)
+{
+        return ((const uint8_t *)p)[0] << 16 | ((const uint8_t *)p)[1] << 8 |
+               ((const uint8_t *)p)[2];
+}
+
+/* Assume 48 bit value placed in uint64_t */
+static inline uint64_t sg_get_unaligned_be48(const void *p)
+{
+        return (uint64_t)__get_unaligned_be16((const uint8_t *)p) << 32 |
+               __get_unaligned_be32((const uint8_t *)p + 2);
+}
+
 /* Returns 0 if 'num_bytes' is less than or equal to 0 or greater than
  * 8 (i.e. sizeof(uint64_t)). Else returns result in uint64_t which is
  * an 8 byte unsigned integer. */
-static inline uint64_t sg_get_unaligned_le(int num_bytes, const void *p)
+static inline uint64_t sg_get_unaligned_be(int num_bytes, const void *p)
 {
         if ((num_bytes <= 0) || (num_bytes > (int)sizeof(uint64_t)))
                 return 0;
         else {
-                const uint8_t * xp = (const uint8_t *)p + (num_bytes - 1);
+                const uint8_t * xp = (const uint8_t *)p;
                 uint64_t res = *xp;
 
-                for (--xp; num_bytes > 1; --xp, --num_bytes)
+                for (++xp; num_bytes > 1; ++xp, --num_bytes)
                         res = (res << 8) | *xp;
                 return res;
         }
 }
 
-static inline void sg_put_unaligned_le16(uint16_t val, void *p)
+static inline void sg_put_unaligned_be24(uint32_t val, void *p)
 {
-        __put_unaligned_le16(val, (uint8_t *)p);
+        ((uint8_t *)p)[0] = (val >> 16) & 0xff;
+        ((uint8_t *)p)[1] = (val >> 8) & 0xff;
+        ((uint8_t *)p)[2] = val & 0xff;
+}
+
+/* Assume 48 bit value placed in uint64_t */
+static inline void __put_unaligned_be48(uint64_t val, uint8_t *p)
+{
+        __put_unaligned_be16(val >> 32, p);
+        __put_unaligned_be32(val, p + 2);
+}
+
+/* Assume 48 bit value placed in uint64_t */
+static inline void sg_put_unaligned_be48(uint64_t val, void *p)
+{
+        __put_unaligned_be48(val, (uint8_t *)p);
+}
+
+/* Now little endian, get 24+48 then put 24+48 */
+static inline uint32_t sg_get_unaligned_le24(const void *p)
+{
+        return (uint32_t)__get_unaligned_le16((const uint8_t *)p) |
+               ((const uint8_t *)p)[2] << 16;
+}
+
+/* Assume 48 bit value placed in uint64_t */
+static inline uint64_t sg_get_unaligned_le48(const void *p)
+{
+        return (uint64_t)__get_unaligned_le16((const uint8_t *)p + 4) << 32 |
+               __get_unaligned_le32((const uint8_t *)p);
 }
 
 static inline void sg_put_unaligned_le24(uint32_t val, void *p)
@@ -267,11 +458,6 @@ static inline void sg_put_unaligned_le24(uint32_t val, void *p)
         ((uint8_t *)p)[0] = val & 0xff;
 }
 
-static inline void sg_put_unaligned_le32(uint32_t val, void *p)
-{
-        __put_unaligned_le32(val, (uint8_t *)p);
-}
-
 /* Assume 48 bit value placed in uint64_t */
 static inline void sg_put_unaligned_le48(uint64_t val, void *p)
 {
@@ -283,14 +469,53 @@ static inline void sg_put_unaligned_le48(uint64_t val, void *p)
         ((uint8_t *)p)[0] = val & 0xff;
 }
 
-static inline void sg_put_unaligned_le64(uint64_t val, void *p)
+/* Returns 0 if 'num_bytes' is less than or equal to 0 or greater than
+ * 8 (i.e. sizeof(uint64_t)). Else returns result in uint64_t which is
+ * an 8 byte unsigned integer. */
+static inline uint64_t sg_get_unaligned_le(int num_bytes, const void *p)
 {
-        __put_unaligned_le64(val, (uint8_t *)p);
+        if ((num_bytes <= 0) || (num_bytes > (int)sizeof(uint64_t)))
+                return 0;
+        else {
+                const uint8_t * xp = (const uint8_t *)p + (num_bytes - 1);
+                uint64_t res = *xp;
+
+                for (--xp; num_bytes > 1; --xp, --num_bytes)
+                        res = (res << 8) | *xp;
+                return res;
+        }
 }
 
 /* Since cdb and parameter blocks are often memset to zero before these
  * unaligned function partially fill them, then check for a val of zero
- * and ignore if it is with these variants. */
+ * and ignore if it is with these variants. First big endian, then little */
+static inline void sg_nz_put_unaligned_be16(uint16_t val, void *p)
+{
+        if (val)
+                __put_unaligned_be16(val, (uint8_t *)p);
+}
+
+static inline void sg_nz_put_unaligned_be24(uint32_t val, void *p)
+{
+        if (val) {
+                ((uint8_t *)p)[0] = (val >> 16) & 0xff;
+                ((uint8_t *)p)[1] = (val >> 8) & 0xff;
+                ((uint8_t *)p)[2] = val & 0xff;
+        }
+}
+
+static inline void sg_nz_put_unaligned_be32(uint32_t val, void *p)
+{
+        if (val)
+                __put_unaligned_be32(val, (uint8_t *)p);
+}
+
+static inline void sg_nz_put_unaligned_be64(uint64_t val, void *p)
+{
+        if (val)
+            __put_unaligned_be64(val, (uint8_t *)p);
+}
+
 static inline void sg_nz_put_unaligned_le16(uint16_t val, void *p)
 {
         if (val)
@@ -318,6 +543,7 @@ static inline void sg_nz_put_unaligned_le64(uint64_t val, void *p)
             __put_unaligned_le64(val, (uint8_t *)p);
 }
 
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 31004776..2acb18da 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -39,6 +39,7 @@ endif
 ## CC = g++
 ## CC = clang
 ## CC = clang++
+## CC = powerpc64-linux-gnu-gcc
 
 # -std=<s> can be c99, c11, gnu11, etc. Default is gnu11 for C code
 # -Wall is no longer all warnings. Add -W (since renamed to -Wextra) for more
diff --git a/src/Makefile.am b/src/Makefile.am
index 9996d015..2407435f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -39,6 +39,7 @@ endif
 ## CC = g++
 ## CC = clang
 ## CC = clang++
+## CC = powerpc64-linux-gnu-gcc
 
 # -std=<s> can be c99, c11, gnu11, etc. Default is gnu11
 # -Wall is no longer all warnings. Add -W (since renamed to -Wextra) for more
diff --git a/src/sg_get_lba_status.c b/src/sg_get_lba_status.c
index aaff98e6..29e95115 100644
--- a/src/sg_get_lba_status.c
+++ b/src/sg_get_lba_status.c
@@ -32,7 +32,7 @@
  * device.
  */
 
-static const char * version_str = "1.13 20180219";
+static const char * version_str = "1.14 20180311";
 
 #ifndef UINT32_MAX
 #define UINT32_MAX ((uint32_t)-1)
@@ -168,7 +168,7 @@ main(int argc, char * argv[])
     const char * device_name = NULL;
     const uint8_t * bp;
     int ret = 0;
-    uint8_t add_status;
+    uint8_t add_status = 0;     /* keep gcc quiet */
 
     while (1) {
         int option_index = 0;
diff --git a/testing/tst_sg_lib.c b/testing/tst_sg_lib.c
index 122d2035..b8e2e2ad 100644
--- a/testing/tst_sg_lib.c
+++ b/testing/tst_sg_lib.c
@@ -17,7 +17,23 @@
 #define __STDC_FORMAT_MACROS 1
 #include <inttypes.h>
 
+#include <time.h>
+
+#ifdef __GNUC__
+#include <byteswap.h>
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"     /* need this to see if HAVE_BYTESWAP_H */
+#endif
+
 #include "sg_lib.h"
+
+/* Uncomment the next two undefs to force use of the generic (i.e. shifting)
+ * unaligned functions (i.e. sg_get_* and sg_put_*). Use "-b 16|32|64
+ * -n 100m" to see the differences in timing. */
+/* #undef HAVE_CONFIG_H */
+/* #undef HAVE_BYTESWAP_H */
 #include "sg_unaligned.h"
 
 /*
@@ -25,17 +41,19 @@
  * related to snprintf().
  */
 
-static const char * version_str = "1.08 20180223";
+static const char * version_str = "1.10 20180311";
 
 
 #define MAX_LINE_LEN 1024
 
 
 static struct option long_options[] = {
+        {"byteswap",  required_argument, 0, 'b'},
         {"exit", no_argument, 0, 'e'},
         {"help", no_argument, 0, 'h'},
         {"hex2",  no_argument, 0, 'H'},
         {"leadin",  required_argument, 0, 'l'},
+        {"num",  required_argument, 0, 'n'},
         {"printf", no_argument, 0, 'p'},
         {"sense", no_argument, 0, 's'},
         {"unaligned", no_argument, 0, 'u'},
@@ -141,12 +159,21 @@ usage()
             "[--printf]\n"
             "                  [--sense] [--unaligned] [--verbose] "
             "[--version]\n"
+#ifdef __GNUC__
+            "  where: --byteswap=B|-b B    B is 16, 32 or 64; tests "
+            "NUM byteswaps\n"
+            "                              compared to sg_unaligned "
+            "equivalent\n"
+            "         --exit|-e          test exit status strings\n"
+#else
             "  where: --exit|-e          test exit status strings\n"
+#endif
             "         --help|-h          print out usage message\n"
             "         --hex2|-H          test hex2* variants\n"
             "         --leadin=STR|-l STR    every line output by --sense "
             "should\n"
             "                                be prefixed by STR\n"
+            "         --num=NUM|-n NUM    number of iterations (def=1)\n"
             "         --printf|-p        test library printf variants\n"
             "         --sense|-s         test sense data handling\n"
             "         --unaligned|-u     test unaligned data handling\n"
@@ -196,6 +223,9 @@ get_exit_status_str(int exit_status, bool longer, int b_len, char * b)
     return b;
 }
 
+static uint8_t arr[64];
+
+#define OFF 8
 
 int
 main(int argc, char * argv[])
@@ -203,7 +233,9 @@ main(int argc, char * argv[])
     bool do_exit_status = false;
     bool ok;
     int k, c, n, len;
+    int byteswap_sz = 0;
     int do_hex2 = 0;
+    int do_num = 1;
     int do_printf = 0;
     int do_sense = 0;
     int do_unaligned = 0;
@@ -217,12 +249,20 @@ main(int argc, char * argv[])
     while (1) {
         int option_index = 0;
 
-        c = getopt_long(argc, argv, "ehHl:psuvV", long_options,
+        c = getopt_long(argc, argv, "b:ehHl:n:psuvV", long_options,
                         &option_index);
         if (c == -1)
             break;
 
         switch (c) {
+        case 'b':
+            byteswap_sz = sg_get_num(optarg);
+            if (! ((16 == byteswap_sz) || (32 == byteswap_sz) ||
+                   (64 == byteswap_sz))) {
+                fprintf(stderr, "--byteswap= requires 16, 32 or 64\n");
+                return 1;
+            }
+            break;
         case 'e':
             do_exit_status = true;
             break;
@@ -236,6 +276,13 @@ main(int argc, char * argv[])
         case 'l':
             leadin = optarg;
             break;
+        case 'n':
+            do_num = sg_get_num(optarg);
+            if (do_num < 0) {
+                fprintf(stderr, "--num= unable decode argument as number\n");
+                return 1;
+            }
+            break;
         case 'p':
             ++do_printf;
             break;
@@ -441,7 +488,7 @@ main(int argc, char * argv[])
         uint16_t u16r;
         uint32_t u24 = 0x224488;
         uint32_t u24r;
-        uint32_t u32 = 0x224488ff;
+        uint32_t u32 = 0x224488aa;
         uint32_t u32r;
         uint64_t u48 = 0x112233445566ULL;
         uint64_t u48r;
@@ -511,15 +558,111 @@ main(int argc, char * argv[])
         hex2stdout(u8, vb ? 10 : 8, -1);
         u64r = sg_get_unaligned_be64(u8);
         printf("  u64r=0x%" PRIx64 "\n\n", u64r);
-        printf("  be[8]:\n");
+
+        printf("  be[v=8 bytes]:\n");
         hex2stdout(u8, vb ? 10 : 8, -1);
         u64r = sg_get_unaligned_be(8, u8);
-        printf("  u64r[8]=0x%" PRIx64 "\n\n", u64r);
-        printf("  le[8]:\n");
+        printf("  u64r[v=8 bytes]=0x%" PRIx64 "\n", u64r);
+        printf("  le[v=8 bytes]:\n");
+        hex2stdout(u8, vb ? 10 : 8, -1);
         u64r = sg_get_unaligned_le(8, u8);
-        printf("  u64r[8]=0x%" PRIx64 "\n\n", u64r);
+        printf("  u64r[v=8 bytes]=0x%" PRIx64 "\n\n", u64r);
+    }
 
+#ifdef __GNUC__
+    if (byteswap_sz > 0) {
+        uint32_t elapsed_msecs;
+        uint16_t count16 = 0;
+        uint32_t count32 = 0;
+        uint64_t count64 = 0;
+        struct timespec start_tm, end_tm;
+
+        ++did_something;
+        if (0 != clock_gettime(CLOCK_MONOTONIC, &start_tm)) {
+            perror("clock_gettime(CLOCK_MONOTONIC)\n");
+            return 1;
+        }
+        for (k = 0; k < do_num; ++k) {
+            switch (byteswap_sz) {
+            case 16:
+                sg_put_unaligned_be16(count16 + 1, arr + OFF);
+                count16 = sg_get_unaligned_be16(arr + OFF);
+                break;
+            case 32:
+                sg_put_unaligned_be32(count32 + 1, arr + OFF);
+                count32 = sg_get_unaligned_be32(arr + OFF);
+                break;
+            case 64:
+                sg_put_unaligned_be64(count64 + 1, arr + OFF);
+                count64 = sg_get_unaligned_be64(arr + OFF);
+                break;
+            default:
+                break;
+            }
+        }
+        if (0 != clock_gettime(CLOCK_MONOTONIC, &end_tm)) {
+            perror("clock_gettime(CLOCK_MONOTONIC)\n");
+            return 1;
+        }
+        elapsed_msecs = (end_tm.tv_sec - start_tm.tv_sec) * 1000;
+        elapsed_msecs += (end_tm.tv_nsec - start_tm.tv_nsec) / 1000000;
+        if (16 == byteswap_sz)
+            printf("  count16=%u\n", count16);
+        else if (32 == byteswap_sz)
+            printf("  count32=%u\n", count32);
+        else
+            printf("  count64=%" PRIu64 "\n", count64);
+        printf("Unaligned elapsed milliseconds: %u\n", elapsed_msecs);
+        count16 = 0;
+        count32 = 0;
+        count64 = 0;
+
+        if (0 != clock_gettime(CLOCK_MONOTONIC, &start_tm)) {
+            perror("clock_gettime(CLOCK_MONOTONIC)\n");
+            return 1;
+        }
+        for (k = 0; k < do_num; ++k) {
+            switch (byteswap_sz) {
+            case 16:
+                count16 = bswap_16(count16 + 1);
+                memcpy(arr + OFF, &count16, 2);
+                memcpy(&count16, arr + OFF, 2);
+                count16 = bswap_16(count16);
+                break;
+            case 32:
+                count32 = bswap_32(count32 + 1);
+                memcpy(arr + OFF, &count32, 4);
+                memcpy(&count32, arr + OFF, 4);
+                count32 = bswap_32(count32);
+                break;
+            case 64:
+                count64 = bswap_64(count64 + 1);
+                memcpy(arr + OFF, &count64, 8);
+                memcpy(&count64, arr + OFF, 8);
+                count64 = bswap_64(count64);
+                break;
+            default:
+                break;
+            }
+        }
+        if (0 != clock_gettime(CLOCK_MONOTONIC, &end_tm)) {
+            perror("clock_gettime(CLOCK_MONOTONIC)\n");
+            return 1;
+        }
+        elapsed_msecs = (end_tm.tv_sec - start_tm.tv_sec) * 1000;
+        elapsed_msecs += (end_tm.tv_nsec - start_tm.tv_nsec) / 1000000;
+        if (16 == byteswap_sz)
+            printf("  count16=%u\n", count16);
+        else if (32 == byteswap_sz)
+            printf("  count32=%u\n", count32);
+        else
+            printf("  count64=%" PRIu64 "\n", count64);
+        printf("Byteswap/memcpy elapsed milliseconds: %u\n", elapsed_msecs);
+        count16 = 0;
+        count32 = 0;
+        count64 = 0;
     }
+#endif
 
     if (0 == did_something)
         printf("Looks like no tests done, check usage with '-h'\n");