add optimized SHA1 algorithm

This optimized implementation of the SHA1 algorithm is about 28%
faster than the old one (on sapphire hardware) but assumes
little-endianness.  Add it, but continue using the old implementation
on big-endian hardware.
diff --git a/include/mincrypt/sha.h b/include/mincrypt/sha.h
index c523460..2bcc5222 100644
--- a/include/mincrypt/sha.h
+++ b/include/mincrypt/sha.h
@@ -13,14 +13,14 @@
 **       be used to endorse or promote products derived from this software
 **       without specific prior written permission.
 **
-** THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR 
+** THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
 ** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 
+** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 ** EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
-** OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-** WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+** OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+** WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 ** OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 ** ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
@@ -29,6 +29,7 @@
 #define _EMBEDDED_SHA_H_
 
 #include <inttypes.h>
+#include <endian.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -36,16 +37,23 @@
 
 typedef struct SHA_CTX {
     uint64_t count;
-    uint8_t buf[64];
     uint32_t state[5];
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+    union {
+        uint8_t b[64];
+        uint32_t w[16];
+    } buf;
+#else
+    uint8_t buf[64];
+#endif
 } SHA_CTX;
 
-void SHA_init(SHA_CTX *ctx);
-void SHA_update(SHA_CTX *ctx, const void* data, int len);
-const uint8_t* SHA_final(SHA_CTX *ctx);
+void SHA_init(SHA_CTX* ctx);
+void SHA_update(SHA_CTX* ctx, const void* data, int len);
+const uint8_t* SHA_final(SHA_CTX* ctx);
 
 /* Convenience method. Returns digest parameter value. */
-const uint8_t* SHA(const void *data, int len, uint8_t *digest);
+const uint8_t* SHA(const void* data, int len, uint8_t* digest);
 
 #define SHA_DIGEST_SIZE 20