Blame - libpixelflinger/codeflinger/texturing.cpp - AOSPA/android_system_core

blob: 90e658407bfa0d8b4ced41f1faa92614dc331756 [file] [log] [blame]

The Android Open Source Project	4f6e8d7	2008-10-21 07:00:00 -0700	[diff] [blame]	1	/* libs/pixelflinger/codeflinger/texturing.cpp
				2	**
				3	** Copyright 2006, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
				18	#include <assert.h>
				19	#include <stdint.h>
				20	#include <stdlib.h>
				21	#include <stdio.h>
				22	#include <sys/types.h>
				23
				24	#include <cutils/log.h>
				25
				26	#include "codeflinger/GGLAssembler.h"
				27
				28
				29	namespace android {
				30
				31	// ---------------------------------------------------------------------------
				32
				33	// iterators are initialized like this:
				34	// (intToFixedCenter(x) * dx)>>16 + x0
				35	// ((x<<16 + 0x8000) * dx)>>16 + x0
				36	// ((x<<16)*dx + (0x8000*dx))>>16 + x0
				37	// ( (x*dx) + dx>>1 ) + x0
				38	// (x*dx) + (dx>>1 + x0)
				39
				40	void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
				41	{
				42	context_t const* c = mBuilderContext.c;
				43	const needs_t& needs = mBuilderContext.needs;
				44
				45	if (mSmooth) {
				46	// NOTE: we could take this case in the mDithering + !mSmooth case,
				47	// but this would use up to 4 more registers for the color components
				48	// for only a little added quality.
				49	// Currently, this causes the system to run out of registers in
				50	// some case (see issue #719496)
				51
				52	comment("compute initial iterated color (smooth and/or dither case)");
				53
				54	parts.iterated_packed = 0;
				55	parts.packed = 0;
				56
				57	// 0x1: color component
				58	// 0x2: iterators
				59	const int optReload = mOptLevel >> 1;
				60	if (optReload >= 3) parts.reload = 0; // reload nothing
				61	else if (optReload == 2) parts.reload = 2; // reload iterators
				62	else if (optReload == 1) parts.reload = 1; // reload colors
				63	else if (optReload <= 0) parts.reload = 3; // reload both
				64
				65	if (!mSmooth) {
				66	// we're not smoothing (just dithering), we never have to
				67	// reload the iterators
				68	parts.reload &= ~2;
				69	}
				70
				71	Scratch scratches(registerFile());
				72	const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
				73	const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
				74	for (int i=0 ; i<4 ; i++) {
				75	if (!mInfo[i].iterated)
				76	continue;
				77
				78	// this component exists in the destination and is not replaced
				79	// by a texture unit.
				80	const int c = (parts.reload & 1) ? t0 : obtainReg();
				81	if (i==0) CONTEXT_LOAD(c, iterators.ydady);
				82	if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
				83	if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
				84	if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
				85	parts.argb[i].reg = c;
				86
				87	if (mInfo[i].smooth) {
				88	parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
				89	const int dvdx = parts.argb_dx[i].reg;
				90	CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
				91	MLA(AL, 0, c, x.reg, dvdx, c);
				92
				93	// adjust the color iterator to make sure it won't overflow
				94	if (!mAA) {
				95	// this is not needed when we're using anti-aliasing
				96	// because we will (have to) clamp the components
				97	// anyway.
				98	int end = scratches.obtain();
				99	MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
				100	MLA(AL, 1, end, dvdx, end, c);
				101	SUB(MI, 0, c, c, end);
				102	BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
				103	scratches.recycle(end);
				104	}
				105	}
				106
				107	if (parts.reload & 1) {
				108	CONTEXT_STORE(c, generated_vars.argb[i].c);
				109	}
				110	}
				111	} else {
				112	// We're not smoothed, so we can
				113	// just use a packed version of the color and extract the
				114	// components as needed (or not at all if we don't blend)
				115
				116	// figure out if we need the iterated color
				117	int load = 0;
				118	for (int i=0 ; i<4 ; i++) {
				119	component_info_t& info = mInfo[i];
				120	if ((info.inDest \|\| info.needed) && !info.replaced)
				121	load \|= 1;
				122	}
				123
				124	parts.iterated_packed = 1;
				125	parts.packed = (!mTextureMachine.mask && !mBlending
				126	&& !mFog && !mDithering);
				127	parts.reload = 0;
				128	if (load \|\| parts.packed) {
				129	if (mBlending \|\| mDithering \|\| mInfo[GGLFormat::ALPHA].needed) {
				130	comment("load initial iterated color (8888 packed)");
				131	parts.iterated.setTo(obtainReg(),
				132	&(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
				133	CONTEXT_LOAD(parts.iterated.reg, packed8888);
				134	} else {
				135	comment("load initial iterated color (dest format packed)");
				136
				137	parts.iterated.setTo(obtainReg(), &mCbFormat);
				138
				139	// pre-mask the iterated color
				140	const int bits = parts.iterated.size();
				141	const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
				142	uint32_t mask = 0;
				143	if (mMasking) {
				144	for (int i=0 ; i<4 ; i++) {
				145	const int component_mask = 1<<i;
				146	const int h = parts.iterated.format.c[i].h;
				147	const int l = parts.iterated.format.c[i].l;
				148	if (h && (!(mMasking & component_mask))) {
				149	mask \|= ((1<<(h-l))-1) << l;
				150	}
				151	}
				152	}
				153
				154	if (mMasking && ((mask & size)==0)) {
				155	// none of the components are present in the mask
				156	} else {
				157	CONTEXT_LOAD(parts.iterated.reg, packed);
				158	if (mCbFormat.size == 1) {
				159	AND(AL, 0, parts.iterated.reg,
				160	parts.iterated.reg, imm(0xFF));
				161	} else if (mCbFormat.size == 2) {
				162	MOV(AL, 0, parts.iterated.reg,
				163	reg_imm(parts.iterated.reg, LSR, 16));
				164	}
				165	}
				166
				167	// pre-mask the iterated color
				168	if (mMasking) {
				169	build_and_immediate(parts.iterated.reg, parts.iterated.reg,
				170	mask, bits);
				171	}
				172	}
				173	}
				174	}
				175	}
				176
				177	void GGLAssembler::build_iterated_color(
				178	component_t& fragment,
				179	const fragment_parts_t& parts,
				180	int component,
				181	Scratch& regs)
				182	{
				183	fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);
				184
				185	if (!mInfo[component].iterated)
				186	return;
				187
				188	if (parts.iterated_packed) {
				189	// iterated colors are packed, extract the one we need
				190	extract(fragment, parts.iterated, component);
				191	} else {
				192	fragment.h = GGL_COLOR_BITS;
				193	fragment.l = GGL_COLOR_BITS - 8;
				194	fragment.flags \|= CLEAR_LO;
				195	// iterated colors are held in their own register,
				196	// (smooth and/or dithering case)
				197	if (parts.reload==3) {
				198	// this implies mSmooth
				199	Scratch scratches(registerFile());
				200	int dx = scratches.obtain();
				201	CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
				202	CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
				203	ADD(AL, 0, dx, fragment.reg, dx);
				204	CONTEXT_STORE(dx, generated_vars.argb[component].c);
				205	} else if (parts.reload & 1) {
				206	CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
				207	} else {
				208	// we don't reload, so simply rename the register and mark as
				209	// non CORRUPTIBLE so that the texture env or blending code
				210	// won't modify this (renamed) register
				211	regs.recycle(fragment.reg);
				212	fragment.reg = parts.argb[component].reg;
				213	fragment.flags &= ~CORRUPTIBLE;
				214	}
				215	if (mInfo[component].smooth && mAA) {
				216	// when using smooth shading AND anti-aliasing, we need to clamp
				217	// the iterators because there is always an extra pixel on the
				218	// edges, which most of the time will cause an overflow
				219	// (since technically its outside of the domain).
				220	BIC(AL, 0, fragment.reg, fragment.reg,
				221	reg_imm(fragment.reg, ASR, 31));
				222	component_sat(fragment);
				223	}
				224	}
				225	}
				226
				227	// ---------------------------------------------------------------------------
				228
				229	void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
				230	{
				231	// gather some informations about the components we need to process...
				232	const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) \| GGL_CLEAR;
				233	switch(opcode) {
				234	case GGL_COPY:
				235	mLogicOp = 0;
				236	break;
				237	case GGL_CLEAR:
				238	case GGL_SET:
				239	mLogicOp = LOGIC_OP;
				240	break;
				241	case GGL_AND:
				242	case GGL_AND_REVERSE:
				243	case GGL_AND_INVERTED:
				244	case GGL_XOR:
				245	case GGL_OR:
				246	case GGL_NOR:
				247	case GGL_EQUIV:
				248	case GGL_OR_REVERSE:
				249	case GGL_OR_INVERTED:
				250	case GGL_NAND:
				251	mLogicOp = LOGIC_OP\|LOGIC_OP_SRC\|LOGIC_OP_DST;
				252	break;
				253	case GGL_NOOP:
				254	case GGL_INVERT:
				255	mLogicOp = LOGIC_OP\|LOGIC_OP_DST;
				256	break;
				257	case GGL_COPY_INVERTED:
				258	mLogicOp = LOGIC_OP\|LOGIC_OP_SRC;
				259	break;
				260	};
				261	}
				262
				263	void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
				264	{
				265	uint8_t replaced=0;
				266	mTextureMachine.mask = 0;
				267	mTextureMachine.activeUnits = 0;
				268	for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
				269	texture_unit_t& tmu = mTextureMachine.tmu[i];
				270	if (replaced == 0xF) {
				271	// all components are replaced, skip this TMU.
				272	tmu.format_idx = 0;
				273	tmu.mask = 0;
				274	tmu.replaced = replaced;
				275	continue;
				276	}
				277	tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
				278	tmu.format = c->formats[tmu.format_idx];
				279	tmu.bits = tmu.format.size*8;
				280	tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
				281	tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
				282	tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
				283	tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
				284	tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
				285	&& tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
				286
				287	// 5551 linear filtering is not supported
				288	if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
				289	tmu.linear = 0;
				290
				291	tmu.mask = 0;
				292	tmu.replaced = replaced;
				293
				294	if (tmu.format_idx) {
				295	mTextureMachine.activeUnits++;
				296	if (tmu.format.c[0].h) tmu.mask \|= 0x1;
				297	if (tmu.format.c[1].h) tmu.mask \|= 0x2;
				298	if (tmu.format.c[2].h) tmu.mask \|= 0x4;
				299	if (tmu.format.c[3].h) tmu.mask \|= 0x8;
				300	if (tmu.env == GGL_REPLACE) {
				301	replaced \|= tmu.mask;
				302	} else if (tmu.env == GGL_DECAL) {
				303	if (!tmu.format.c[GGLFormat::ALPHA].h) {
				304	// if we don't have alpha, decal does nothing
				305	tmu.mask = 0;
				306	} else {
				307	// decal always ignores At
				308	tmu.mask &= ~(1<<GGLFormat::ALPHA);
				309	}
				310	}
				311	}
				312	mTextureMachine.mask \|= tmu.mask;
				313	//printf("%d: mask=%08lx, replaced=%08lx\n",
				314	// i, int(tmu.mask), int(tmu.replaced));
				315	}
				316	mTextureMachine.replaced = replaced;
				317	mTextureMachine.directTexture = 0;
				318	//printf("replaced=%08lx\n", mTextureMachine.replaced);
				319	}
				320
				321
				322	void GGLAssembler::init_textures(
				323	tex_coord_t* coords,
				324	const reg_t& x, const reg_t& y)
				325	{
				326	context_t const* c = mBuilderContext.c;
				327	const needs_t& needs = mBuilderContext.needs;
				328	int Rctx = mBuilderContext.Rctx;
				329	int Rx = x.reg;
				330	int Ry = y.reg;
				331
				332	if (mTextureMachine.mask) {
				333	comment("compute texture coordinates");
				334	}
				335
				336	// init texture coordinates for each tmu
				337	const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
				338	const bool multiTexture = mTextureMachine.activeUnits > 1;
				339	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				340	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				341	if (tmu.format_idx == 0)
				342	continue;
				343	if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
				344	(tmu.twrap == GGL_NEEDS_WRAP_11))
				345	{
				346	// 1:1 texture
				347	pointer_t& txPtr = coords[i].ptr;
				348	txPtr.setTo(obtainReg(), tmu.bits);
				349	CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
				350	ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16)); // x += (s>>16)
				351	CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
				352	ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16)); // y += (t>>16)
				353	// merge base & offset
				354	CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
				355	SMLABB(AL, Rx, Ry, txPtr.reg, Rx); // x+y*stride
				356	CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
				357	base_offset(txPtr, txPtr, Rx);
				358	} else {
				359	Scratch scratches(registerFile());
				360	reg_t& s = coords[i].s;
				361	reg_t& t = coords[i].t;
				362	// s = (x * dsdx)>>16 + ydsdy
				363	// s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
				364	// t = (x * dtdx)>>16 + ydtdy
				365	// t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
				366	s.setTo(obtainReg());
				367	t.setTo(obtainReg());
				368	const int need_w = GGL_READ_NEEDS(W, needs.n);
				369	if (need_w) {
				370	CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
				371	CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
				372	} else {
				373	int ydsdy = scratches.obtain();
				374	int ydtdy = scratches.obtain();
				375	CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
				376	CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
				377	CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
				378	CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
				379	MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
				380	MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
				381	}
				382
				383	if ((mOptLevel&1)==0) {
				384	CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
				385	CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
				386	recycleReg(s.reg);
				387	recycleReg(t.reg);
				388	}
				389	}
				390
				391	// direct texture?
				392	if (!multiTexture && !mBlending && !mDithering && !mFog &&
				393	cb_format_idx == tmu.format_idx && !tmu.linear &&
				394	mTextureMachine.replaced == tmu.mask)
				395	{
				396	mTextureMachine.directTexture = i + 1;
				397	}
				398	}
				399	}
				400
				401	void GGLAssembler::build_textures( fragment_parts_t& parts,
				402	Scratch& regs)
				403	{
				404	context_t const* c = mBuilderContext.c;
				405	const needs_t& needs = mBuilderContext.needs;
				406	int Rctx = mBuilderContext.Rctx;
				407
				408	// We don't have a way to spill registers automatically
				409	// spill depth and AA regs, when we know we may have to.
				410	// build the spill list...
				411	uint32_t spill_list = 0;
				412	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				413	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				414	if (tmu.format_idx == 0)
				415	continue;
				416	if (tmu.linear) {
				417	// we may run out of register if we have linear filtering
				418	// at 1 or 4 bytes / pixel on any texture unit.
				419	if (tmu.format.size == 1) {
				420	// if depth and AA enabled, we'll run out of 1 register
				421	if (parts.z.reg > 0 && parts.covPtr.reg > 0)
				422	spill_list \|= 1<<parts.covPtr.reg;
				423	}
				424	if (tmu.format.size == 4) {
				425	// if depth or AA enabled, we'll run out of 1 or 2 registers
				426	if (parts.z.reg > 0)
				427	spill_list \|= 1<<parts.z.reg;
				428	if (parts.covPtr.reg > 0)
				429	spill_list \|= 1<<parts.covPtr.reg;
				430	}
				431	}
				432	}
				433
				434	Spill spill(registerFile(), *this, spill_list);
				435
				436	const bool multiTexture = mTextureMachine.activeUnits > 1;
				437	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				438	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				439	if (tmu.format_idx == 0)
				440	continue;
				441
				442	pointer_t& txPtr = parts.coords[i].ptr;
				443	pixel_t& texel = parts.texel[i];
				444
				445	// repeat...
				446	if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
				447	(tmu.twrap == GGL_NEEDS_WRAP_11))
				448	{ // 1:1 textures
				449	comment("fetch texel");
				450	texel.setTo(regs.obtain(), &tmu.format);
				451	load(txPtr, texel, WRITE_BACK);
				452	} else {
				453	Scratch scratches(registerFile());
				454	reg_t& s = parts.coords[i].s;
				455	reg_t& t = parts.coords[i].t;
				456	if ((mOptLevel&1)==0) {
				457	comment("reload s/t (multitexture or linear filtering)");
				458	s.reg = scratches.obtain();
				459	t.reg = scratches.obtain();
				460	CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
				461	CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
				462	}
				463
				464	comment("compute repeat/clamp");
				465	int u = scratches.obtain();
				466	int v = scratches.obtain();
				467	int width = scratches.obtain();
				468	int height = scratches.obtain();
				469	int U = 0;
				470	int V = 0;
				471
				472	CONTEXT_LOAD(width, generated_vars.texture[i].width);
				473	CONTEXT_LOAD(height, generated_vars.texture[i].height);
				474
				475	int FRAC_BITS = 0;
				476	if (tmu.linear) {
				477	// linear interpolation
				478	if (tmu.format.size == 1) {
				479	// for 8-bits textures, we can afford
				480	// 7 bits of fractional precision at no
				481	// additional cost (we can't do 8 bits
				482	// because filter8 uses signed 16 bits muls)
				483	FRAC_BITS = 7;
				484	} else if (tmu.format.size == 2) {
				485	// filter16() is internally limited to 4 bits, so:
				486	// FRAC_BITS=2 generates less instructions,
				487	// FRAC_BITS=3,4,5 creates unpleasant artifacts,
				488	// FRAC_BITS=6+ looks good
				489	FRAC_BITS = 6;
				490	} else if (tmu.format.size == 4) {
				491	// filter32() is internally limited to 8 bits, so:
				492	// FRAC_BITS=4 looks good
				493	// FRAC_BITS=5+ looks better, but generates 3 extra ipp
				494	FRAC_BITS = 6;
				495	} else {
				496	// for all other cases we use 4 bits.
				497	FRAC_BITS = 4;
				498	}
				499	}
				500	wrapping(u, s.reg, width, tmu.swrap, FRAC_BITS);
				501	wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
				502
				503	if (tmu.linear) {
				504	comment("compute linear filtering offsets");
				505	// pixel size scale
				506	const int shift = 31 - gglClz(tmu.format.size);
				507	U = scratches.obtain();
				508	V = scratches.obtain();
				509
				510	// sample the texel center
				511	SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
				512	SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
				513
				514	// get the fractionnal part of U,V
				515	AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
				516	AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
				517
				518	// compute width-1 and height-1
				519	SUB(AL, 0, width, width, imm(1));
				520	SUB(AL, 0, height, height, imm(1));
				521
				522	// get the integer part of U,V and clamp/wrap
				523	// and compute offset to the next texel
				524	if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
				525	// u has already been REPEATed
				526	MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
				527	MOV(MI, 0, u, width);
				528	CMP(AL, u, width);
				529	MOV(LT, 0, width, imm(1 << shift));
				530	if (shift)
				531	MOV(GE, 0, width, reg_imm(width, LSL, shift));
				532	RSB(GE, 0, width, width, imm(0));
				533	} else {
				534	// u has not been CLAMPed yet
				535	// algorithm:
				536	// if ((u>>4) >= width)
				537	// u = width<<4
				538	// width = 0
				539	// else
				540	// width = 1<<shift
				541	// u = u>>4; // get integer part
				542	// if (u<0)
				543	// u = 0
				544	// width = 0
				545	// generated_vars.rt = width
				546
				547	CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
				548	MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
				549	MOV(LE, 0, width, imm(0));
				550	MOV(GT, 0, width, imm(1 << shift));
				551	MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
				552	MOV(MI, 0, u, imm(0));
				553	MOV(MI, 0, width, imm(0));
				554	}
				555	CONTEXT_STORE(width, generated_vars.rt);
				556
				557	const int stride = width;
				558	CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
				559	if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
				560	// v has already been REPEATed
				561	MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
				562	MOV(MI, 0, v, height);
				563	CMP(AL, v, height);
				564	MOV(LT, 0, height, imm(1 << shift));
				565	if (shift)
				566	MOV(GE, 0, height, reg_imm(height, LSL, shift));
				567	RSB(GE, 0, height, height, imm(0));
				568	MUL(AL, 0, height, stride, height);
				569	} else {
				570	// u has not been CLAMPed yet
				571	CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
				572	MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
				573	MOV(LE, 0, height, imm(0));
				574	if (shift) {
				575	MOV(GT, 0, height, reg_imm(stride, LSL, shift));
				576	} else {
				577	MOV(GT, 0, height, stride);
				578	}
				579	MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
				580	MOV(MI, 0, v, imm(0));
				581	MOV(MI, 0, height, imm(0));
				582	}
				583	CONTEXT_STORE(height, generated_vars.lb);
				584	}
				585
				586	scratches.recycle(width);
				587	scratches.recycle(height);
				588
				589	// iterate texture coordinates...
				590	comment("iterate s,t");
				591	int dsdx = scratches.obtain();
				592	int dtdx = scratches.obtain();
				593	CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
				594	CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
				595	ADD(AL, 0, s.reg, s.reg, dsdx);
				596	ADD(AL, 0, t.reg, t.reg, dtdx);
				597	if ((mOptLevel&1)==0) {
				598	CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
				599	CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
				600	scratches.recycle(s.reg);
				601	scratches.recycle(t.reg);
				602	}
				603	scratches.recycle(dsdx);
				604	scratches.recycle(dtdx);
				605
				606	// merge base & offset...
				607	comment("merge base & offset");
				608	texel.setTo(regs.obtain(), &tmu.format);
				609	txPtr.setTo(texel.reg, tmu.bits);
				610	int stride = scratches.obtain();
				611	CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
				612	CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
				613	SMLABB(AL, u, v, stride, u); // u+v*stride
				614	base_offset(txPtr, txPtr, u);
				615
				616	// load texel
				617	if (!tmu.linear) {
				618	comment("fetch texel");
				619	load(txPtr, texel, 0);
				620	} else {
				621	// recycle registers we don't need anymore
				622	scratches.recycle(u);
				623	scratches.recycle(v);
				624	scratches.recycle(stride);
				625
				626	comment("fetch texel, bilinear");
				627	switch (tmu.format.size) {
				628	case 1: filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				629	case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				630	case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				631	case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				632	}
				633	}
				634	}
				635	}
				636	}
				637
				638	void GGLAssembler::build_iterate_texture_coordinates(
				639	const fragment_parts_t& parts)
				640	{
				641	const bool multiTexture = mTextureMachine.activeUnits > 1;
				642	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				643	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				644	if (tmu.format_idx == 0)
				645	continue;
				646
				647	if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
				648	(tmu.twrap == GGL_NEEDS_WRAP_11))
				649	{ // 1:1 textures
				650	const pointer_t& txPtr = parts.coords[i].ptr;
				651	ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
				652	} else {
				653	Scratch scratches(registerFile());
				654	int s = parts.coords[i].s.reg;
				655	int t = parts.coords[i].t.reg;
				656	if ((mOptLevel&1)==0) {
				657	s = scratches.obtain();
				658	t = scratches.obtain();
				659	CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
				660	CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
				661	}
				662	int dsdx = scratches.obtain();
				663	int dtdx = scratches.obtain();
				664	CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
				665	CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
				666	ADD(AL, 0, s, s, dsdx);
				667	ADD(AL, 0, t, t, dtdx);
				668	if ((mOptLevel&1)==0) {
				669	CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
				670	CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
				671	}
				672	}
				673	}
				674	}
				675
				676	void GGLAssembler::filter8(
				677	const fragment_parts_t& parts,
				678	pixel_t& texel, const texture_unit_t& tmu,
				679	int U, int V, pointer_t& txPtr,
				680	int FRAC_BITS)
				681	{
				682	if (tmu.format.components != GGL_ALPHA &&
				683	tmu.format.components != GGL_LUMINANCE)
				684	{
				685	// this is a packed format, and we don't support
				686	// linear filtering (it's probably RGB 332)
				687	// Should not happen with OpenGL\|ES
				688	LDRB(AL, texel.reg, txPtr.reg);
				689	return;
				690	}
				691
				692	// ------------------------
				693	// about ~22 cycles / pixel
				694	Scratch scratches(registerFile());
				695
				696	int pixel= scratches.obtain();
				697	int d = scratches.obtain();
				698	int u = scratches.obtain();
				699	int k = scratches.obtain();
				700	int rt = scratches.obtain();
				701	int lb = scratches.obtain();
				702
				703	// RB -> U * V
				704
				705	CONTEXT_LOAD(rt, generated_vars.rt);
				706	CONTEXT_LOAD(lb, generated_vars.lb);
				707
				708	int offset = pixel;
				709	ADD(AL, 0, offset, lb, rt);
				710	LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				711	SMULBB(AL, u, U, V);
				712	SMULBB(AL, d, pixel, u);
				713	RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
				714
				715	// LB -> (1-U) * V
				716	RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
				717	LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
				718	SMULBB(AL, u, U, V);
				719	SMLABB(AL, d, pixel, u, d);
				720	SUB(AL, 0, k, k, u);
				721
				722	// LT -> (1-U)*(1-V)
				723	RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
				724	LDRB(AL, pixel, txPtr.reg);
				725	SMULBB(AL, u, U, V);
				726	SMLABB(AL, d, pixel, u, d);
				727
				728	// RT -> U*(1-V)
				729	LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
				730	SUB(AL, 0, u, k, u);
				731	SMLABB(AL, texel.reg, pixel, u, d);
				732
				733	for (int i=0 ; i<4 ; i++) {
				734	if (!texel.format.c[i].h) continue;
				735	texel.format.c[i].h = FRAC_BITS*2+8;
				736	texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits in enough
				737	}
				738	texel.format.size = 4;
				739	texel.format.bitsPerPixel = 32;
				740	texel.flags \|= CLEAR_LO;
				741	}
				742
				743	void GGLAssembler::filter16(
				744	const fragment_parts_t& parts,
				745	pixel_t& texel, const texture_unit_t& tmu,
				746	int U, int V, pointer_t& txPtr,
				747	int FRAC_BITS)
				748	{
				749	// compute the mask
				750	// XXX: it would be nice if the mask below could be computed
				751	// automatically.
				752	uint32_t mask = 0;
				753	int shift = 0;
				754	int prec = 0;
				755	switch (tmu.format_idx) {
				756	case GGL_PIXEL_FORMAT_RGB_565:
				757	// source: 00000ggg.ggg00000 \| rrrrr000.000bbbbb
				758	// result: gggggggg.gggrrrrr \| rrrrr0bb.bbbbbbbb
				759	mask = 0x07E0F81F;
				760	shift = 16;
				761	prec = 5;
				762	break;
				763	case GGL_PIXEL_FORMAT_RGBA_4444:
				764	// 0000,1111,0000,1111 \| 0000,1111,0000,1111
				765	mask = 0x0F0F0F0F;
				766	shift = 12;
				767	prec = 4;
				768	break;
				769	case GGL_PIXEL_FORMAT_LA_88:
				770	// 0000,0000,1111,1111 \| 0000,0000,1111,1111
				771	// AALL -> 00AA \| 00LL
				772	mask = 0x00FF00FF;
				773	shift = 8;
				774	prec = 8;
				775	break;
				776	default:
				777	// unsupported format, do something sensical...
				778	LOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx);
				779	LDRH(AL, texel.reg, txPtr.reg);
				780	return;
				781	}
				782
				783	const int adjust = FRAC_BITS*2 - prec;
				784	const int round = 0;
				785
				786	// update the texel format
				787	texel.format.size = 4;
				788	texel.format.bitsPerPixel = 32;
				789	texel.flags \|= CLEAR_HI\|CLEAR_LO;
				790	for (int i=0 ; i<4 ; i++) {
				791	if (!texel.format.c[i].h) continue;
				792	const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
				793	texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
				794	texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
				795	}
				796
				797	// ------------------------
				798	// about ~40 cycles / pixel
				799	Scratch scratches(registerFile());
				800
				801	int pixel= scratches.obtain();
				802	int d = scratches.obtain();
				803	int u = scratches.obtain();
				804	int k = scratches.obtain();
				805
				806	// RB -> U * V
				807	int offset = pixel;
				808	CONTEXT_LOAD(offset, generated_vars.rt);
				809	CONTEXT_LOAD(u, generated_vars.lb);
				810	ADD(AL, 0, offset, offset, u);
				811
				812	LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
				813	SMULBB(AL, u, U, V);
				814	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				815	build_and_immediate(pixel, pixel, mask, 32);
				816	if (adjust) {
				817	if (round)
				818	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				819	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				820	}
				821	MUL(AL, 0, d, pixel, u);
				822	RSB(AL, 0, k, u, imm(1<<prec));
				823
				824	// LB -> (1-U) * V
				825	CONTEXT_LOAD(offset, generated_vars.lb);
				826	RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
				827	LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
				828	SMULBB(AL, u, U, V);
				829	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				830	build_and_immediate(pixel, pixel, mask, 32);
				831	if (adjust) {
				832	if (round)
				833	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				834	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				835	}
				836	MLA(AL, 0, d, pixel, u, d);
				837	SUB(AL, 0, k, k, u);
				838
				839	// LT -> (1-U)*(1-V)
				840	RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
				841	LDRH(AL, pixel, txPtr.reg);
				842	SMULBB(AL, u, U, V);
				843	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				844	build_and_immediate(pixel, pixel, mask, 32);
				845	if (adjust) {
				846	if (round)
				847	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				848	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				849	}
				850	MLA(AL, 0, d, pixel, u, d);
				851
				852	// RT -> U*(1-V)
				853	CONTEXT_LOAD(offset, generated_vars.rt);
				854	LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
				855	SUB(AL, 0, u, k, u);
				856	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				857	build_and_immediate(pixel, pixel, mask, 32);
				858	MLA(AL, 0, texel.reg, pixel, u, d);
				859	}
				860
				861	void GGLAssembler::filter24(
				862	const fragment_parts_t& parts,
				863	pixel_t& texel, const texture_unit_t& tmu,
				864	int U, int V, pointer_t& txPtr,
				865	int FRAC_BITS)
				866	{
				867	// not supported yet (currently disabled)
				868	load(txPtr, texel, 0);
				869	}
				870
				871	void GGLAssembler::filter32(
				872	const fragment_parts_t& parts,
				873	pixel_t& texel, const texture_unit_t& tmu,
				874	int U, int V, pointer_t& txPtr,
				875	int FRAC_BITS)
				876	{
				877	const int adjust = FRAC_BITS*2 - 8;
				878	const int round = 0;
				879
				880	// ------------------------
				881	// about ~38 cycles / pixel
				882	Scratch scratches(registerFile());
				883
				884	int pixel= scratches.obtain();
				885	int dh = scratches.obtain();
				886	int u = scratches.obtain();
				887	int k = scratches.obtain();
				888
				889	int temp = scratches.obtain();
				890	int dl = scratches.obtain();
				891	int mask = scratches.obtain();
				892
				893	MOV(AL, 0, mask, imm(0xFF));
				894	ORR(AL, 0, mask, mask, imm(0xFF0000));
				895
				896	// RB -> U * V
				897	int offset = pixel;
				898	CONTEXT_LOAD(offset, generated_vars.rt);
				899	CONTEXT_LOAD(u, generated_vars.lb);
				900	ADD(AL, 0, offset, offset, u);
				901
				902	LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				903	SMULBB(AL, u, U, V);
				904	AND(AL, 0, temp, mask, pixel);
				905	if (adjust) {
				906	if (round)
				907	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				908	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				909	}
				910	MUL(AL, 0, dh, temp, u);
				911	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				912	MUL(AL, 0, dl, temp, u);
				913	RSB(AL, 0, k, u, imm(0x100));
				914
				915	// LB -> (1-U) * V
				916	CONTEXT_LOAD(offset, generated_vars.lb);
				917	RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
				918	LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				919	SMULBB(AL, u, U, V);
				920	AND(AL, 0, temp, mask, pixel);
				921	if (adjust) {
				922	if (round)
				923	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				924	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				925	}
				926	MLA(AL, 0, dh, temp, u, dh);
				927	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				928	MLA(AL, 0, dl, temp, u, dl);
				929	SUB(AL, 0, k, k, u);
				930
				931	// LT -> (1-U)*(1-V)
				932	RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
				933	LDR(AL, pixel, txPtr.reg);
				934	SMULBB(AL, u, U, V);
				935	AND(AL, 0, temp, mask, pixel);
				936	if (adjust) {
				937	if (round)
				938	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				939	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				940	}
				941	MLA(AL, 0, dh, temp, u, dh);
				942	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				943	MLA(AL, 0, dl, temp, u, dl);
				944
				945	// RT -> U*(1-V)
				946	CONTEXT_LOAD(offset, generated_vars.rt);
				947	LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				948	SUB(AL, 0, u, k, u);
				949	AND(AL, 0, temp, mask, pixel);
				950	MLA(AL, 0, dh, temp, u, dh);
				951	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				952	MLA(AL, 0, dl, temp, u, dl);
				953
				954	AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
				955	AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
				956	ORR(AL, 0, texel.reg, dh, dl);
				957	}
				958
				959	void GGLAssembler::build_texture_environment(
				960	component_t& fragment,
				961	const fragment_parts_t& parts,
				962	int component,
				963	Scratch& regs)
				964	{
				965	const uint32_t component_mask = 1<<component;
				966	const bool multiTexture = mTextureMachine.activeUnits > 1;
				967	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
				968	texture_unit_t& tmu = mTextureMachine.tmu[i];
				969
				970	if (tmu.mask & component_mask) {
				971	// replace or modulate with this texture
				972	if ((tmu.replaced & component_mask) == 0) {
				973	// not replaced by a later tmu...
				974
				975	Scratch scratches(registerFile());
				976	pixel_t texel(parts.texel[i]);
				977	if (multiTexture &&
				978	tmu.swrap == GGL_NEEDS_WRAP_11 &&
				979	tmu.twrap == GGL_NEEDS_WRAP_11)
				980	{
				981	texel.reg = scratches.obtain();
				982	texel.flags \|= CORRUPTIBLE;
				983	comment("fetch texel (multitexture 1:1)");
				984	load(parts.coords[i].ptr, texel, WRITE_BACK);
				985	}
				986
				987	component_t incoming(fragment);
				988	modify(fragment, regs);
				989
				990	switch (tmu.env) {
				991	case GGL_REPLACE:
				992	extract(fragment, texel, component);
				993	break;
				994	case GGL_MODULATE:
				995	modulate(fragment, incoming, texel, component);
				996	break;
				997	case GGL_DECAL:
				998	decal(fragment, incoming, texel, component);
				999	break;
				1000	case GGL_BLEND:
				1001	blend(fragment, incoming, texel, component, i);
				1002	break;
The Android Open Source Project	35237d1	2008-12-17 18:08:08 -0800	[diff] [blame^]	1003	case GGL_ADD:
				1004	add(fragment, incoming, texel, component);
				1005	break;
The Android Open Source Project	4f6e8d7	2008-10-21 07:00:00 -0700	[diff] [blame]	1006	}
				1007	}
				1008	}
				1009	}
				1010	}
				1011
				1012	// ---------------------------------------------------------------------------
				1013
				1014	void GGLAssembler::wrapping(
				1015	int d,
				1016	int coord, int size,
				1017	int tx_wrap, int tx_linear)
				1018	{
				1019	// notes:
				1020	// if tx_linear is set, we need 4 extra bits of precision on the result
				1021	// SMULL/UMULL is 3 cycles
				1022	Scratch scratches(registerFile());
				1023	int c = coord;
				1024	if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
				1025	// UMULL takes 4 cycles (interlocked), and we can get away with
				1026	// 2 cycles using SMULWB, but we're loosing 16 bits of precision
				1027	// out of 32 (this is not a problem because the iterator keeps
				1028	// its full precision)
				1029	// UMULL(AL, 0, size, d, c, size);
				1030	// note: we can't use SMULTB because it's signed.
				1031	MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
				1032	SMULWB(AL, d, d, size);
				1033	} else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
				1034	if (tx_linear) {
				1035	// 1 cycle
				1036	MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
				1037	} else {
				1038	// 4 cycles (common case)
				1039	MOV(AL, 0, d, reg_imm(coord, ASR, 16));
				1040	BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
				1041	CMP(AL, d, size);
				1042	SUB(GE, 0, d, size, imm(1));
				1043	}
				1044	}
				1045	}
				1046
				1047	// ---------------------------------------------------------------------------
				1048
				1049	void GGLAssembler::modulate(
				1050	component_t& dest,
				1051	const component_t& incoming,
				1052	const pixel_t& incomingTexel, int component)
				1053	{
				1054	Scratch locals(registerFile());
				1055	integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
				1056	extract(texel, incomingTexel, component);
				1057
				1058	const int Nt = texel.size();
				1059	// Nt should always be less than 10 bits because it comes
				1060	// from the TMU.
				1061
				1062	int Ni = incoming.size();
				1063	// Ni could be big because it comes from previous MODULATEs
				1064
				1065	if (Nt == 1) {
				1066	// texel acts as a bit-mask
				1067	// dest = incoming & ((texel << incoming.h)-texel)
				1068	RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
				1069	AND(AL, 0, dest.reg, dest.reg, incoming.reg);
				1070	dest.l = incoming.l;
				1071	dest.h = incoming.h;
				1072	dest.flags \|= (incoming.flags & CLEAR_LO);
				1073	} else if (Ni == 1) {
				1074	MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
				1075	AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
				1076	dest.l = 0;
				1077	dest.h = Nt;
				1078	} else {
				1079	int inReg = incoming.reg;
				1080	int shift = incoming.l;
				1081	if ((Nt + Ni) > 32) {
				1082	// we will overflow, reduce the precision of Ni to 8 bits
				1083	// (Note Nt cannot be more than 10 bits which happens with
				1084	// 565 textures and GGL_LINEAR)
				1085	shift += Ni-8;
				1086	Ni = 8;
				1087	}
				1088
				1089	// modulate by the component with the lowest precision
				1090	if (Nt >= Ni) {
				1091	if (shift) {
				1092	// XXX: we should be able to avoid this shift
				1093	// when shift==16 && Nt<16 && Ni<16, in which
				1094	// we could use SMULBT below.
				1095	MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
				1096	inReg = dest.reg;
				1097	shift = 0;
				1098	}
				1099	// operation: (Cf*Ct)/((1<<Ni)-1)
				1100	// approximated with: Cf*(Ct + Ct>>(Ni-1))>>Ni
				1101	// this operation doesn't change texel's size
				1102	ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
				1103	if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
				1104	else MUL(AL, 0, dest.reg, texel.reg, dest.reg);
				1105	dest.l = Ni;
				1106	dest.h = Nt + Ni;
				1107	} else {
				1108	if (shift && (shift != 16)) {
				1109	// if shift==16, we can use 16-bits mul instructions later
				1110	MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
				1111	inReg = dest.reg;
				1112	shift = 0;
				1113	}
				1114	// operation: (Cf*Ct)/((1<<Nt)-1)
				1115	// approximated with: Ct*(Cf + Cf>>(Nt-1))>>Nt
				1116	// this operation doesn't change incoming's size
				1117	Scratch scratches(registerFile());
				1118	int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
				1119	if (t == inReg)
				1120	t = scratches.obtain();
				1121	ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
				1122	if (Nt<16 && Ni<16) {
				1123	if (shift==16) SMULBT(AL, dest.reg, t, inReg);
				1124	else SMULBB(AL, dest.reg, t, inReg);
				1125	} else MUL(AL, 0, dest.reg, t, inReg);
				1126	dest.l = Nt;
				1127	dest.h = Nt + Ni;
				1128	}
				1129
				1130	// low bits are not valid
				1131	dest.flags \|= CLEAR_LO;
				1132
				1133	// no need to keep more than 8 bits/component
				1134	if (dest.size() > 8)
				1135	dest.l = dest.h-8;
				1136	}
				1137	}
				1138
				1139	void GGLAssembler::decal(
				1140	component_t& dest,
				1141	const component_t& incoming,
				1142	const pixel_t& incomingTexel, int component)
				1143	{
				1144	// RGBA:
				1145	// Cv = Cf(1 - At) + CtAt = Cf + (Ct - Cf)*At
				1146	// Av = Af
				1147	Scratch locals(registerFile());
				1148	integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
				1149	integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
				1150	extract(texel, incomingTexel, component);
				1151	extract(factor, incomingTexel, GGLFormat::ALPHA);
				1152
				1153	// no need to keep more than 8-bits for decal
				1154	int Ni = incoming.size();
				1155	int shift = incoming.l;
				1156	if (Ni > 8) {
				1157	shift += Ni-8;
				1158	Ni = 8;
				1159	}
				1160	integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
				1161	if (shift) {
				1162	MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
				1163	incomingNorm.reg = dest.reg;
				1164	incomingNorm.flags \|= CORRUPTIBLE;
				1165	}
				1166	ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
				1167	build_blendOneMinusFF(dest, factor, incomingNorm, texel);
				1168	}
				1169
				1170	void GGLAssembler::blend(
				1171	component_t& dest,
				1172	const component_t& incoming,
				1173	const pixel_t& incomingTexel, int component, int tmu)
				1174	{
				1175	// RGBA:
				1176	// Cv = (1 - Ct)Cf + CtCc = Cf + (Cc - Cf)*Ct
				1177	// Av = At*Af
				1178
				1179	if (component == GGLFormat::ALPHA) {
				1180	modulate(dest, incoming, incomingTexel, component);
				1181	return;
				1182	}
				1183
				1184	Scratch locals(registerFile());
				1185	integer_t color(locals.obtain(), 8, CORRUPTIBLE);
				1186	integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
				1187	LDRB(AL, color.reg, mBuilderContext.Rctx,
				1188	immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
				1189	extract(factor, incomingTexel, component);
				1190
				1191	// no need to keep more than 8-bits for blend
				1192	int Ni = incoming.size();
				1193	int shift = incoming.l;
				1194	if (Ni > 8) {
				1195	shift += Ni-8;
				1196	Ni = 8;
				1197	}
				1198	integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
				1199	if (shift) {
				1200	MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
				1201	incomingNorm.reg = dest.reg;
				1202	incomingNorm.flags \|= CORRUPTIBLE;
				1203	}
				1204	ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
				1205	build_blendOneMinusFF(dest, factor, incomingNorm, color);
				1206	}
				1207
The Android Open Source Project	35237d1	2008-12-17 18:08:08 -0800	[diff] [blame^]	1208	void GGLAssembler::add(
				1209	component_t& dest,
				1210	const component_t& incoming,
				1211	const pixel_t& incomingTexel, int component)
				1212	{
				1213	// RGBA:
				1214	// Cv = Cf + Ct;
				1215	Scratch locals(registerFile());
				1216
				1217	component_t incomingTemp(incoming);
				1218
				1219	// use "dest" as a temporary for extracting the texel, unless "dest"
				1220	// overlaps "incoming".
				1221	integer_t texel(dest.reg, 32, CORRUPTIBLE);
				1222	if (dest.reg == incomingTemp.reg)
				1223	texel.reg = locals.obtain();
				1224	extract(texel, incomingTexel, component);
				1225
				1226	if (texel.s < incomingTemp.size()) {
				1227	expand(texel, texel, incomingTemp.size());
				1228	} else if (texel.s > incomingTemp.size()) {
				1229	if (incomingTemp.flags & CORRUPTIBLE) {
				1230	expand(incomingTemp, incomingTemp, texel.s);
				1231	} else {
				1232	incomingTemp.reg = locals.obtain();
				1233	expand(incomingTemp, incoming, texel.s);
				1234	}
				1235	}
				1236
				1237	if (incomingTemp.l) {
				1238	ADD(AL, 0, dest.reg, texel.reg,
				1239	reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
				1240	} else {
				1241	ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
				1242	}
				1243	dest.l = 0;
				1244	dest.h = texel.size();
				1245	component_sat(dest);
				1246	}
				1247
The Android Open Source Project	4f6e8d7	2008-10-21 07:00:00 -0700	[diff] [blame]	1248	// ----------------------------------------------------------------------------
				1249
				1250	}; // namespace android
				1251