Only use faster SHA-1 code on machines with endian.h and byteswap.h

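Macs don't have byteswap.h or endian.h.  Change the conditionals to use
the faster SHA-1 code only on little-endian machines that have endian.h
(both HAVE_ENDIAN_H and HAVE_LITTLE_ENDIAN defined); byteswap.h is
assumed to be available wherever endian.h is.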
diff --git a/include/mincrypt/sha.h b/include/mincrypt/sha.h
index 2bcc5222..af63e87 100644
--- a/include/mincrypt/sha.h
+++ b/include/mincrypt/sha.h
@@ -29,7 +29,6 @@
 #define _EMBEDDED_SHA_H_
 
 #include <inttypes.h>
-#include <endian.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,7 +37,7 @@
 typedef struct SHA_CTX {
     uint64_t count;
     uint32_t state[5];
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#if defined(HAVE_ENDIAN_H) && defined(HAVE_LITTLE_ENDIAN)
     union {
         uint8_t b[64];
         uint32_t w[16];
diff --git a/libmincrypt/sha.c b/libmincrypt/sha.c
index 33d1cb3..e089d79 100644
--- a/libmincrypt/sha.c
+++ b/libmincrypt/sha.c
@@ -25,13 +25,15 @@
 ** ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include <byteswap.h>
-#include <endian.h>
-#include <memory.h>
-
 #include "mincrypt/sha.h"
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+// Some machines lack byteswap.h and endian.h.  These have to use the
+// slower code, even if they're little-endian.
+
+#if defined(HAVE_ENDIAN_H) && defined(HAVE_LITTLE_ENDIAN)
+
+#include <byteswap.h>
+#include <memory.h>
 
 // This version is about 28% faster than the generic version below,
 // but assumes little-endianness.
@@ -186,7 +188,7 @@
     return ctx->buf.b;
 }
 
-#else   // __BYTE_ORDER == BIG_ENDIAN
+#else   // #if defined(HAVE_ENDIAN_H) && defined(HAVE_LITTLE_ENDIAN)
 
 #define rol(bits, value) (((value) << (bits)) | ((value) >> (32 - (bits))))