Add Mips support to libpixelflinger

Change-Id: Ib81cb01b8d90ed1afa1fd54b3cc009d7fec0f814
diff --git a/libpixelflinger/Android.mk b/libpixelflinger/Android.mk
index ed2ab5e..1947c2d 100644
--- a/libpixelflinger/Android.mk
+++ b/libpixelflinger/Android.mk
@@ -43,6 +43,11 @@
 PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer
 endif
 
+ifeq ($(TARGET_ARCH),mips)
+PIXELFLINGER_SRC_FILES += arch-mips/t32cb16blend.S
+PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer
+endif
+
 LOCAL_SHARED_LIBRARIES := libcutils
 
 ifneq ($(TARGET_ARCH),arm)
diff --git a/libpixelflinger/arch-mips/t32cb16blend.S b/libpixelflinger/arch-mips/t32cb16blend.S
new file mode 100644
index 0000000..c911fbb
--- /dev/null
+++ b/libpixelflinger/arch-mips/t32cb16blend.S
@@ -0,0 +1,264 @@
+/* libs/pixelflinger/t32cb16blend.S
+**
+** Copyright 2010, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifdef DEBUG
+#define DBG
+#else
+#define DBG #
+#endif
+
+/*
+ * blend one of 2 16bpp RGB pixels held in dreg selected by shift
+ * with the 32bpp ABGR pixel held in src and store the result in fb
+ *
+ * Assumes that the dreg data is little endian and that
+ * the the second pixel (shift==16) will be merged into
+ * the fb result
+ *
+ * Uses $t0,$t6,$t7,$t8
+ */
+
+#if __mips==32 && __mips_isa_rev>=2
+	.macro pixel dreg src fb shift
+	/*
+	 * sA = s >> 24
+	 * f = 0x100 - (sA + (sA>>7))
+	 */
+DBG	.set	noat
+DBG	rdhwr	$at,$2
+DBG	.set	at
+
+	srl	$t7,\src,24
+	srl	$t6,$t7,7
+	addu	$t7,$t6
+	li	$t6,0x100
+	subu	$t7,$t6,$t7
+
+	/* red */
+	ext	$t8,\dreg,\shift+6+5,5			# dst[\shift:15..11]
+	mul	$t6,$t8,$t7
+	ext	$t0,\dreg,\shift+5,6			# start green extraction dst[\shift:10..5]
+	ext	$t8,\src,3,5				# src[7..3]
+	srl	$t6,8
+	addu	$t8,$t6
+	ins	\fb,$t8,\shift+6+5,5			# dst[\shift:15..11]
+
+        /* green */
+	mul	$t8,$t0,$t7
+	ext	$t0,\dreg,\shift,5			# start blue extraction dst[\shift:4..0]
+	ext	$t6,\src,2+8,6				# src[15..10]
+	srl	$t8,8
+        addu	$t8,$t6
+
+	/* blue */
+	mul	$t0,$t0,$t7
+	ins	\fb,$t8,\shift+5,6			# finish green insertion dst[\shift:10..5]
+	ext	$t6,\src,(3+8+8),5
+	srl	$t8,$t0,8
+	addu	$t8,$t6
+	ins	\fb,$t8,\shift,5
+
+DBG	.set	noat
+DBG	rdhwr	$t8,$2
+DBG	subu	$t8,$at
+DBG	sltu	$at,$t8,$v0
+DBG	movn	$v0,$t8,$at
+DBG	sgtu	$at,$t8,$v1
+DBG	movn	$v1,$t8,$at
+DBG	.set	at
+	.endm
+
+#else
+
+	.macro pixel dreg src fb shift
+	/*
+	 * sA = s >> 24
+	 * f = 0x100 - (sA + (sA>>7))
+	 */
+DBG	.set	push
+DBG	.set	noat
+DBG	.set	mips32r2
+DBG 	rdhwr	$at,$2
+DBG	.set	pop
+
+	srl	$t7,\src,24
+	srl	$t6,$t7,7
+	addu	$t7,$t6
+	li	$t6,0x100
+	subu	$t7,$t6,$t7
+
+	/*
+	 * red
+	 * dR = (d >> (6 + 5)) & 0x1f;
+	 * dR = (f*dR)>>8
+	 * sR = (s >> (   3)) & 0x1f;
+	 * sR += dR
+	 * fb |= sR << 11
+	 */
+	srl	$t8,\dreg,\shift+6+5
+.if \shift==0
+	and     $t8,0x1f
+.endif
+	mul	$t8,$t8,$t7
+	srl	$t6,\src,3
+	and	$t6,0x1f
+	srl	$t8,8
+	addu	$t8,$t6
+.if \shift!=0
+	sll	$t8,\shift+11
+	or	\fb,$t8
+.else
+	sll	\fb,$t8,11
+.endif
+
+        /*
+	 * green
+	 * dG = (d >> 5) & 0x3f
+	 * dG = (f*dG) >> 8
+	 * sG = (s >> ( 8+2))&0x3F;
+	 */
+	srl	$t8,\dreg,\shift+5
+        and	$t8,0x3f
+	mul	$t8,$t8,$t7
+        srl	$t6,\src,8+2
+        and     $t6,0x3f
+	srl	$t8,8
+        addu	$t8,$t6
+	sll	$t8,\shift + 5
+	or	\fb,$t8
+
+	/* blue */
+.if \shift!=0
+	srl	$t8,\dreg,\shift
+	and	$t8,0x1f
+.else
+	and	$t8,\dreg,0x1f
+.endif
+	mul	$t8,$t8,$t7
+	srl	$t6,\src,(8+8+3)
+	and	$t6,0x1f
+	srl	$t8,8
+	addu	$t8,$t6
+.if \shift!=0
+	sll	$t8,\shift
+.endif
+	or	\fb,$t8
+DBG	.set	push
+DBG	.set	noat
+DBG	.set	mips32r2
+DBG	rdhwr	$t8,$2
+DBG	subu	$t8,$at
+DBG	sltu	$at,$t8,$v0
+DBG	movn	$v0,$t8,$at
+DBG	sgtu	$at,$t8,$v1
+DBG	movn	$v1,$t8,$at
+DBG	.set	pop
+	.endm
+#endif
+
+	.text
+	.align
+
+	.global scanline_t32cb16blend_mips
+	.ent	scanline_t32cb16blend_mips
+scanline_t32cb16blend_mips:
+DBG	li	$v0,0xffffffff
+DBG	li	$v1,0
+	/* Align the destination if necessary */
+	and	$t0,$a0,3
+	beqz	$t0,aligned
+
+	/* as long as there is at least one pixel */
+	beqz	$a2,done
+
+	lw	$t4,($a1)
+	addu	$a0,2
+	addu	$a1,4
+	beqz	$t4,1f
+	lhu	$t3,-2($a0)
+	pixel   $t3,$t4,$t1,0
+	sh	$t1,-2($a0)
+1:	subu	$a2,1
+
+aligned:
+	/* Check to see if its worth unrolling the loop */
+	subu	$a2,4
+	bltz	$a2,tail
+
+	/* Process 4 pixels at a time */
+fourpixels:
+	/* 1st pair of pixels */
+	lw	$t4,0($a1)
+	lw	$t5,4($a1)
+	addu	$a0,8
+	addu	$a1,16
+
+	/* both are zero, skip this pair */
+	or	$t3,$t4,$t5
+	beqz	$t3,1f
+
+	/* load the destination */
+	lw	$t3,-8($a0)
+
+	pixel	$t3,$t4,$t1,0
+	pixel	$t3,$t5,$t1,16
+	sw	$t1,-8($a0)
+
+1:
+	/* 2nd pair of pixels */
+	lw	$t4,-8($a1)
+	lw	$t5,-4($a1)
+
+	/* both are zero, skip this pair */
+	or	$t3,$t4,$t5
+	beqz	$t3,1f
+
+	/* load the destination */
+	lw	$t3,-4($a0)
+
+	pixel	$t3,$t4,$t1,0
+	pixel	$t3,$t5,$t1,16
+	sw	$t1,-4($a0)
+
+1:	subu    $a2,4
+	bgtz	$a2,fourpixels
+
+tail:
+	/* the pixel count underran, restore it now */
+	addu	$a2,4
+
+	/* handle the last 0..3 pixels */
+	beqz	$a2,done
+onepixel:
+	lw	$t4,($a1)
+	addu	$a0,2
+	addu	$a1,4
+	beqz	$t4,1f
+	lhu	$t3,-2($a0)
+	pixel   $t3,$t4,$t1,0
+	sh	$t1,-2($a0)
+1:	subu	$a2,1
+	bnez	$a2,onepixel
+done:
+DBG	.set    push
+DBG	.set    mips32r2
+DBG 	rdhwr	$a0,$3
+DBG 	mul	$v0,$a0
+DBG 	mul	$v1,$a0
+DBG	.set    pop
+	j	$ra
+	.end	scanline_t32cb16blend_mips
diff --git a/libpixelflinger/scanline.cpp b/libpixelflinger/scanline.cpp
index 93440f5..d1f3d96 100644
--- a/libpixelflinger/scanline.cpp
+++ b/libpixelflinger/scanline.cpp
@@ -110,10 +110,14 @@
 static void rect_generic(context_t* c, size_t yc);
 static void rect_memcpy(context_t* c, size_t yc);
 
+#if defined( __arm__)
 extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
 extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
 extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
 extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct);
+#elif defined(__mips__)
+extern "C" void scanline_t32cb16blend_mips(uint16_t*, uint32_t*, size_t);
+#endif
 
 // ----------------------------------------------------------------------------
 
@@ -2136,7 +2140,7 @@
 
 void scanline_t32cb16blend(context_t* c)
 {
-#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
+#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__mips)))
     int32_t x = c->iterators.xl;
     size_t ct = c->iterators.xr - x;
     int32_t y = c->iterators.y;
@@ -2148,8 +2152,12 @@
     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
 
+#ifdef __arm__
     scanline_t32cb16blend_arm(dst, src, ct);
 #else
+    scanline_t32cb16blend_mips(dst, src, ct);
+#endif
+#else
     dst_iterator16  di(c);
     horz_iterator32  hi(c);
     blender_32to16  bl(c);