Pixelflinger: Add AArch64 support to pixelflinger JIT.

See the comment block at the top of Aarch64Assembler.cpp
for an overview of how AArch64 support has been implemented.

In addition, this commit contains:
[x] AArch64 inline asm versions of the gglmul series of
    functions and a new unit test bench to test the
    functions

[x] Assembly implementations of scanline_col32cb16blend
    and scanline_t32cb16blend for AArch64, with a unit
    test bench

Change-Id: I915cded9e1d39d9a2a70bf8a0394b8a0064d1eb4
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>
diff --git a/libpixelflinger/scanline.cpp b/libpixelflinger/scanline.cpp
index 96a71f3..bc774f3 100644
--- a/libpixelflinger/scanline.cpp
+++ b/libpixelflinger/scanline.cpp
@@ -31,8 +31,11 @@
 
 #include "codeflinger/CodeCache.h"
 #include "codeflinger/GGLAssembler.h"
+#if defined(__arm__)
 #include "codeflinger/ARMAssembler.h"
-#if defined(__mips__)
+#elif defined(__aarch64__)
+#include "codeflinger/Aarch64Assembler.h"
+#elif defined(__mips__)
 #include "codeflinger/MIPSAssembler.h"
 #endif
 //#include "codeflinger/ARMAssemblerOptimizer.h"
@@ -52,7 +55,7 @@
 #   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
 #endif
 
-#if defined(__arm__) || defined(__mips__)
+#if defined(__arm__) || defined(__mips__) || defined(__aarch64__)
 #   define ANDROID_ARM_CODEGEN  1
 #else
 #   define ANDROID_ARM_CODEGEN  0
@@ -68,6 +71,8 @@
 
 #ifdef __mips__
 #define ASSEMBLY_SCRATCH_SIZE   4096
+#elif defined(__aarch64__)
+#define ASSEMBLY_SCRATCH_SIZE   8192
 #else
 #define ASSEMBLY_SCRATCH_SIZE   2048
 #endif
@@ -122,6 +127,9 @@
 extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
 extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
 extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct);
+#elif defined(__aarch64__)
+extern "C" void scanline_t32cb16blend_aarch64(uint16_t*, uint32_t*, size_t);
+extern "C" void scanline_col32cb16blend_aarch64(uint16_t *dst, uint32_t col, size_t ct);
 #elif defined(__mips__)
 extern "C" void scanline_t32cb16blend_mips(uint16_t*, uint32_t*, size_t);
 #endif
@@ -276,6 +284,8 @@
 
 #if defined(__mips__)
 static CodeCache gCodeCache(32 * 1024);
+#elif defined(__aarch64__)
+static CodeCache gCodeCache(48 * 1024);
 #else
 static CodeCache gCodeCache(12 * 1024);
 #endif
@@ -394,6 +404,8 @@
 #endif
 #if defined(__mips__)
         GGLAssembler assembler( new ArmToMipsAssembler(a) );
+#elif defined(__aarch64__)
+        GGLAssembler assembler( new ArmToAarch64Assembler(a) );
 #endif
         // generate the scanline code for the given needs
         int err = assembler.scanline(c->state.needs, c);
@@ -2085,6 +2097,8 @@
 #else  // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
     scanline_col32cb16blend_arm(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
 #endif // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
+#elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__aarch64__))
+    scanline_col32cb16blend_aarch64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
 #else
     uint32_t s = GGL_RGBA_TO_HOST(c->packed8888);
     int sA = (s>>24);
@@ -2157,7 +2171,7 @@
 
 void scanline_t32cb16blend(context_t* c)
 {
-#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__mips)))
+#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__mips__) || defined(__aarch64__)))
     int32_t x = c->iterators.xl;
     size_t ct = c->iterators.xr - x;
     int32_t y = c->iterators.y;
@@ -2171,7 +2185,9 @@
 
 #ifdef __arm__
     scanline_t32cb16blend_arm(dst, src, ct);
-#else
+#elif defined(__aarch64__)
+    scanline_t32cb16blend_aarch64(dst, src, ct);
+#elif defined(__mips__)
     scanline_t32cb16blend_mips(dst, src, ct);
 #endif
 #else