Blame - libpixelflinger/scanline.cpp - android_system_core

blob: d24c988051261a711f709cc7c49a9e700ef7f9be [file] [log] [blame]

The Android Open Source Project	4f6e8d7	2008-10-21 07:00:00 -0700	[diff] [blame^]	1	/* libs/pixelflinger/scanline.cpp
				2	**
				3	** Copyright 2006, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
				18
				19	#define LOG_TAG "pixelflinger"
				20
				21	#include <assert.h>
				22	#include <stdlib.h>
				23	#include <stdio.h>
				24	#include <string.h>
				25
				26	#include <cutils/memory.h>
				27	#include <cutils/log.h>
				28
				29	#include "buffer.h"
				30	#include "scanline.h"
				31
				32	#include "codeflinger/CodeCache.h"
				33	#include "codeflinger/GGLAssembler.h"
				34	#include "codeflinger/ARMAssembler.h"
				35	//#include "codeflinger/ARMAssemblerOptimizer.h"
				36
				37	// ----------------------------------------------------------------------------
				38
				39	#define ANDROID_CODEGEN_GENERIC 0 // force generic pixel pipeline
				40	#define ANDROID_CODEGEN_C 1 // hand-written C, fallback generic
				41	#define ANDROID_CODEGEN_ASM 2 // hand-written asm, fallback generic
				42	#define ANDROID_CODEGEN_GENERATED 3 // hand-written asm, fallback codegen
				43
				44	#ifdef NDEBUG
				45	# define ANDROID_RELEASE
				46	# define ANDROID_CODEGEN ANDROID_CODEGEN_GENERATED
				47	#else
				48	# define ANDROID_DEBUG
				49	# define ANDROID_CODEGEN ANDROID_CODEGEN_GENERATED
				50	#endif
				51
				52	#if defined(__arm__)
				53	# define ANDROID_ARM_CODEGEN 1
				54	#else
				55	# define ANDROID_ARM_CODEGEN 0
				56	#endif
				57
				58
				59	#define DEBUG__CODEGEN_ONLY 0
				60
				61	// ----------------------------------------------------------------------------
				62	namespace android {
				63	// ----------------------------------------------------------------------------
				64
				65	static void init_y(context_t*, int32_t);
				66	static void init_y_noop(context_t*, int32_t);
				67	static void init_y_packed(context_t*, int32_t);
				68	static void init_y_error(context_t*, int32_t);
				69
				70	static void step_y__generic(context_t* c);
				71	static void step_y__nop(context_t*);
				72	static void step_y__smooth(context_t* c);
				73	static void step_y__tmu(context_t* c);
				74	static void step_y__w(context_t* c);
				75
				76	static void scanline(context_t* c);
				77	static void scanline_perspective(context_t* c);
				78	static void scanline_perspective_single(context_t* c);
				79	static void scanline_t32cb16blend(context_t* c);
				80	static void scanline_t32cb16(context_t* c);
				81	static void scanline_memcpy(context_t* c);
				82	static void scanline_memset8(context_t* c);
				83	static void scanline_memset16(context_t* c);
				84	static void scanline_memset32(context_t* c);
				85	static void scanline_noop(context_t* c);
				86	static void scanline_set(context_t* c);
				87	static void scanline_clear(context_t* c);
				88
				89	static void rect_generic(context_t* c, size_t yc);
				90	static void rect_memcpy(context_t* c, size_t yc);
				91
				92	extern "C" void scanline_t32cb16blend_arm(uint16_t, uint32_t, size_t);
				93	extern "C" void scanline_t32cb16_arm(uint16_t dst, uint32_t src, size_t ct);
				94
				95	// ----------------------------------------------------------------------------
				96
				97	struct shortcut_t {
				98	needs_filter_t filter;
				99	const char* desc;
				100	void (scanline)(context_t);
				101	void (init_y)(context_t, int32_t);
				102	};
				103
				104	// Keep in sync with needs
				105	static shortcut_t shortcuts[] = {
				106	{ { { 0x03515104, 0x00000077, { 0x00000A01, 0x00000000 } },
				107	{ 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
				108	"565 fb, 8888 tx, blend", scanline_t32cb16blend, init_y_noop },
				109	{ { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } },
				110	{ 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
				111	"565 fb, 8888 tx", scanline_t32cb16, init_y_noop },
				112	{ { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
				113	{ 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } },
				114	"(nop) alpha test", scanline_noop, init_y_noop },
				115	{ { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
				116	{ 0x00000000, 0x00000070, { 0x00000000, 0x00000000 } } },
				117	"(nop) depth test", scanline_noop, init_y_noop },
				118	{ { { 0x05000000, 0x00000000, { 0x00000000, 0x00000000 } },
				119	{ 0x0F000000, 0x00000080, { 0x00000000, 0x00000000 } } },
				120	"(nop) logic_op", scanline_noop, init_y_noop },
				121	{ { { 0xF0000000, 0x00000000, { 0x00000000, 0x00000000 } },
				122	{ 0xF0000000, 0x00000080, { 0x00000000, 0x00000000 } } },
				123	"(nop) color mask", scanline_noop, init_y_noop },
				124	{ { { 0x0F000000, 0x00000077, { 0x00000000, 0x00000000 } },
				125	{ 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
				126	"(set) logic_op", scanline_set, init_y_noop },
				127	{ { { 0x00000000, 0x00000077, { 0x00000000, 0x00000000 } },
				128	{ 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
				129	"(clear) logic_op", scanline_clear, init_y_noop },
				130	{ { { 0x03000000, 0x00000077, { 0x00000000, 0x00000000 } },
				131	{ 0xFFFFFF00, 0x000000F7, { 0x00000000, 0x00000000 } } },
				132	"(clear) blending 0/0", scanline_clear, init_y_noop },
				133	{ { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
				134	{ 0x0000003F, 0x00000000, { 0x00000000, 0x00000000 } } },
				135	"(error) invalid color-buffer format", scanline_noop, init_y_error },
				136	};
				137	static const needs_filter_t noblend1to1 = {
				138	// (disregard dithering, see below)
				139	{ 0x03010100, 0x00000077, { 0x00000A00, 0x00000000 } },
				140	{ 0xFFFFFFC0, 0xFFFFFEFF, { 0xFFFFFFC0, 0x0000003F } }
				141	};
				142	static const needs_filter_t fill16noblend = {
				143	{ 0x03010100, 0x00000077, { 0x00000000, 0x00000000 } },
				144	{ 0xFFFFFFC0, 0xFFFFFFFF, { 0x0000003F, 0x0000003F } }
				145	};
				146
				147	// ----------------------------------------------------------------------------
				148
				149	#if ANDROID_ARM_CODEGEN
				150	static CodeCache gCodeCache(12 * 1024);
				151
				152	class ScanlineAssembly : public Assembly {
				153	AssemblyKey<needs_t> mKey;
				154	public:
				155	ScanlineAssembly(needs_t needs, size_t size)
				156	: Assembly(size), mKey(needs) { }
				157	const AssemblyKey<needs_t>& key() const { return mKey; }
				158	};
				159	#endif
				160
				161	// ----------------------------------------------------------------------------
				162
				163	void ggl_init_scanline(context_t* c)
				164	{
				165	c->init_y = init_y;
				166	c->step_y = step_y__generic;
				167	c->scanline = scanline;
				168	}
				169
				170	void ggl_uninit_scanline(context_t* c)
				171	{
				172	if (c->state.buffers.coverage)
				173	free(c->state.buffers.coverage);
				174	#if ANDROID_ARM_CODEGEN
				175	if (c->scanline_as)
				176	c->scanline_as->decStrong(c);
				177	#endif
				178	}
				179
				180	// ----------------------------------------------------------------------------
				181
				182	static void pick_scanline(context_t* c)
				183	{
				184	#if (!defined(DEBUG__CODEGEN_ONLY) \|\| (DEBUG__CODEGEN_ONLY == 0))
				185
				186	#if ANDROID_CODEGEN == ANDROID_CODEGEN_GENERIC
				187	c->init_y = init_y;
				188	c->step_y = step_y__generic;
				189	c->scanline = scanline;
				190	return;
				191	#endif
				192
				193	//printf("*** needs [%08lx:%08lx:%08lx:%08lx]\n",
				194	// c->state.needs.n, c->state.needs.p,
				195	// c->state.needs.t[0], c->state.needs.t[1]);
				196
				197	// first handle the special case that we cannot test with a filter
				198	const uint32_t cb_format = GGL_READ_NEEDS(CB_FORMAT, c->state.needs.n);
				199	if (GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0]) == cb_format) {
				200	if (c->state.needs.match(noblend1to1)) {
				201	// this will match regardless of dithering state, since both
				202	// src and dest have the same format anyway, there is no dithering
				203	// to be done.
				204	const GGLFormat* f =
				205	&(c->formats[GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0])]);
				206	if ((f->components == GGL_RGB) \|\|
				207	(f->components == GGL_RGBA) \|\|
				208	(f->components == GGL_LUMINANCE) \|\|
				209	(f->components == GGL_LUMINANCE_ALPHA))
				210	{
				211	// format must have all of RGB components
				212	// (so the current color doesn't show through)
				213	c->scanline = scanline_memcpy;
				214	c->init_y = init_y_noop;
				215	return;
				216	}
				217	}
				218	}
				219
				220	if (c->state.needs.match(fill16noblend)) {
				221	c->init_y = init_y_packed;
				222	switch (c->formats[cb_format].size) {
				223	case 1: c->scanline = scanline_memset8; return;
				224	case 2: c->scanline = scanline_memset16; return;
				225	case 4: c->scanline = scanline_memset32; return;
				226	}
				227	}
				228
				229	const int numFilters = sizeof(shortcuts)/sizeof(shortcut_t);
				230	for (int i=0 ; i<numFilters ; i++) {
				231	if (c->state.needs.match(shortcuts[i].filter)) {
				232	c->scanline = shortcuts[i].scanline;
				233	c->init_y = shortcuts[i].init_y;
				234	return;
				235	}
				236	}
				237
				238	#endif // DEBUG__CODEGEN_ONLY
				239
				240	c->init_y = init_y;
				241	c->step_y = step_y__generic;
				242
				243	#if ANDROID_ARM_CODEGEN
				244	// we're going to have to generate some code...
				245	// here, generate code for our pixel pipeline
				246	const AssemblyKey<needs_t> key(c->state.needs);
				247	sp<Assembly> assembly = gCodeCache.lookup(key);
				248	if (assembly == 0) {
				249	// create a new assembly region
				250	sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs, 1024);
				251	// initialize our assembler
				252	GGLAssembler assembler( new ARMAssembler(a) );
				253	//GGLAssembler assembler(
				254	// new ARMAssemblerOptimizer(new ARMAssembler(a)) );
				255	// generate the scanline code for the given needs
				256	int err = assembler.scanline(c->state.needs, c);
				257	if (ggl_likely(!err)) {
				258	// finally, cache this assembly
				259	err = gCodeCache.cache(a->key(), a);
				260	}
				261	if (ggl_unlikely(err)) {
				262	LOGE("error generating or caching assembly. Reverting to NOP.");
				263	c->scanline = scanline_noop;
				264	c->init_y = init_y_noop;
				265	c->step_y = step_y__nop;
				266	return;
				267	}
				268	assembly = a;
				269	}
				270
				271	// release the previous assembly
				272	if (c->scanline_as) {
				273	c->scanline_as->decStrong(c);
				274	}
				275
				276	//LOGI("using generated pixel-pipeline");
				277	c->scanline_as = assembly.get();
				278	c->scanline_as->incStrong(c); // hold on to assembly
				279	c->scanline = (void()(context_t c))assembly->base();
				280	#else
				281	// LOGW("using generic (slow) pixel-pipeline");
				282	c->scanline = scanline;
				283	#endif
				284	}
				285
				286	void ggl_pick_scanline(context_t* c)
				287	{
				288	pick_scanline(c);
				289	if ((c->state.enables & GGL_ENABLE_W) &&
				290	(c->state.enables & GGL_ENABLE_TMUS))
				291	{
				292	c->span = c->scanline;
				293	c->scanline = scanline_perspective;
				294	if (!(c->state.enabled_tmu & (c->state.enabled_tmu - 1))) {
				295	// only one TMU enabled
				296	c->scanline = scanline_perspective_single;
				297	}
				298	}
				299	}
				300
				301	// ----------------------------------------------------------------------------
				302
				303	static void blending(context_t* c, pixel_t* fragment, pixel_t* fb);
				304	static void blend_factor(context_t* c, pixel_t* r, uint32_t factor,
				305	const pixel_t* src, const pixel_t* dst);
				306	static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv);
				307
				308	#if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
				309
				310	// no need to compile the generic-pipeline, it can't be reached
				311	void scanline(context_t*)
				312	{
				313	}
				314
				315	#else
				316
				317	void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv)
				318	{
				319	if (su && sv) {
				320	if (su > sv) {
				321	v = ggl_expand(v, sv, su);
				322	sv = su;
				323	} else if (su < sv) {
				324	u = ggl_expand(u, su, sv);
				325	su = sv;
				326	}
				327	}
				328	}
				329
				330	void blending(context_t* c, pixel_t* fragment, pixel_t* fb)
				331	{
				332	rescale(fragment->c[0], fragment->s[0], fb->c[0], fb->s[0]);
				333	rescale(fragment->c[1], fragment->s[1], fb->c[1], fb->s[1]);
				334	rescale(fragment->c[2], fragment->s[2], fb->c[2], fb->s[2]);
				335	rescale(fragment->c[3], fragment->s[3], fb->c[3], fb->s[3]);
				336
				337	pixel_t sf, df;
				338	blend_factor(c, &sf, c->state.blend.src, fragment, fb);
				339	blend_factor(c, &df, c->state.blend.dst, fragment, fb);
				340
				341	fragment->c[1] =
				342	gglMulAddx(fragment->c[1], sf.c[1], gglMulx(fb->c[1], df.c[1]));
				343	fragment->c[2] =
				344	gglMulAddx(fragment->c[2], sf.c[2], gglMulx(fb->c[2], df.c[2]));
				345	fragment->c[3] =
				346	gglMulAddx(fragment->c[3], sf.c[3], gglMulx(fb->c[3], df.c[3]));
				347
				348	if (c->state.blend.alpha_separate) {
				349	blend_factor(c, &sf, c->state.blend.src_alpha, fragment, fb);
				350	blend_factor(c, &df, c->state.blend.dst_alpha, fragment, fb);
				351	}
				352
				353	fragment->c[0] =
				354	gglMulAddx(fragment->c[0], sf.c[0], gglMulx(fb->c[0], df.c[0]));
				355
				356	// clamp to 1.0
				357	if (fragment->c[0] >= (1LU<<fragment->s[0]))
				358	fragment->c[0] = (1<<fragment->s[0])-1;
				359	if (fragment->c[1] >= (1LU<<fragment->s[1]))
				360	fragment->c[1] = (1<<fragment->s[1])-1;
				361	if (fragment->c[2] >= (1LU<<fragment->s[2]))
				362	fragment->c[2] = (1<<fragment->s[2])-1;
				363	if (fragment->c[3] >= (1LU<<fragment->s[3]))
				364	fragment->c[3] = (1<<fragment->s[3])-1;
				365	}
				366
				367	static inline int blendfactor(uint32_t x, uint32_t size, uint32_t def = 0)
				368	{
				369	if (!size)
				370	return def;
				371
				372	// scale to 16 bits
				373	if (size > 16) {
				374	x >>= (size - 16);
				375	} else if (size < 16) {
				376	x = ggl_expand(x, size, 16);
				377	}
				378	x += x >> 15;
				379	return x;
				380	}
				381
				382	void blend_factor(context_t* c, pixel_t* r,
				383	uint32_t factor, const pixel_t* src, const pixel_t* dst)
				384	{
				385	switch (factor) {
				386	case GGL_ZERO:
				387	r->c[1] =
				388	r->c[2] =
				389	r->c[3] =
				390	r->c[0] = 0;
				391	break;
				392	case GGL_ONE:
				393	r->c[1] =
				394	r->c[2] =
				395	r->c[3] =
				396	r->c[0] = FIXED_ONE;
				397	break;
				398	case GGL_DST_COLOR:
				399	r->c[1] = blendfactor(dst->c[1], dst->s[1]);
				400	r->c[2] = blendfactor(dst->c[2], dst->s[2]);
				401	r->c[3] = blendfactor(dst->c[3], dst->s[3]);
				402	r->c[0] = blendfactor(dst->c[0], dst->s[0]);
				403	break;
				404	case GGL_SRC_COLOR:
				405	r->c[1] = blendfactor(src->c[1], src->s[1]);
				406	r->c[2] = blendfactor(src->c[2], src->s[2]);
				407	r->c[3] = blendfactor(src->c[3], src->s[3]);
				408	r->c[0] = blendfactor(src->c[0], src->s[0]);
				409	break;
				410	case GGL_ONE_MINUS_DST_COLOR:
				411	r->c[1] = FIXED_ONE - blendfactor(dst->c[1], dst->s[1]);
				412	r->c[2] = FIXED_ONE - blendfactor(dst->c[2], dst->s[2]);
				413	r->c[3] = FIXED_ONE - blendfactor(dst->c[3], dst->s[3]);
				414	r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0]);
				415	break;
				416	case GGL_ONE_MINUS_SRC_COLOR:
				417	r->c[1] = FIXED_ONE - blendfactor(src->c[1], src->s[1]);
				418	r->c[2] = FIXED_ONE - blendfactor(src->c[2], src->s[2]);
				419	r->c[3] = FIXED_ONE - blendfactor(src->c[3], src->s[3]);
				420	r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0]);
				421	break;
				422	case GGL_SRC_ALPHA:
				423	r->c[1] =
				424	r->c[2] =
				425	r->c[3] =
				426	r->c[0] = blendfactor(src->c[0], src->s[0], FIXED_ONE);
				427	break;
				428	case GGL_ONE_MINUS_SRC_ALPHA:
				429	r->c[1] =
				430	r->c[2] =
				431	r->c[3] =
				432	r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0], FIXED_ONE);
				433	break;
				434	case GGL_DST_ALPHA:
				435	r->c[1] =
				436	r->c[2] =
				437	r->c[3] =
				438	r->c[0] = blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
				439	break;
				440	case GGL_ONE_MINUS_DST_ALPHA:
				441	r->c[1] =
				442	r->c[2] =
				443	r->c[3] =
				444	r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
				445	break;
				446	case GGL_SRC_ALPHA_SATURATE:
				447	// XXX: GGL_SRC_ALPHA_SATURATE
				448	break;
				449	}
				450	}
				451
				452	static GGLfixed wrapping(int32_t coord, uint32_t size, int tx_wrap)
				453	{
				454	GGLfixed d;
				455	if (tx_wrap == GGL_REPEAT) {
				456	d = (uint32_t(coord)>>16) * size;
				457	} else if (tx_wrap == GGL_CLAMP) { // CLAMP_TO_EDGE semantics
				458	const GGLfixed clamp_min = FIXED_HALF;
				459	const GGLfixed clamp_max = (size << 16) - FIXED_HALF;
				460	if (coord < clamp_min) coord = clamp_min;
				461	if (coord > clamp_max) coord = clamp_max;
				462	d = coord;
				463	} else { // 1:1
				464	const GGLfixed clamp_min = 0;
				465	const GGLfixed clamp_max = (size << 16);
				466	if (coord < clamp_min) coord = clamp_min;
				467	if (coord > clamp_max) coord = clamp_max;
				468	d = coord;
				469	}
				470	return d;
				471	}
				472
				473	static inline
				474	GGLcolor ADJUST_COLOR_ITERATOR(GGLcolor v, GGLcolor dvdx, int len)
				475	{
				476	const int32_t end = dvdx * (len-1) + v;
				477	if (end < 0)
				478	v -= end;
				479	v &= ~(v>>31);
				480	return v;
				481	}
				482
				483	void scanline(context_t* c)
				484	{
				485	const uint32_t enables = c->state.enables;
				486	const int xs = c->iterators.xl;
				487	const int x1 = c->iterators.xr;
				488	int xc = x1 - xs;
				489	const int16_t* covPtr = c->state.buffers.coverage + xs;
				490
				491	// All iterated values are sampled at the pixel center
				492
				493	// reset iterators for that scanline...
				494	GGLcolor r, g, b, a;
				495	iterators_t& ci = c->iterators;
				496	if (enables & GGL_ENABLE_SMOOTH) {
				497	r = (xs * c->shade.drdx) + ci.ydrdy;
				498	g = (xs * c->shade.dgdx) + ci.ydgdy;
				499	b = (xs * c->shade.dbdx) + ci.ydbdy;
				500	a = (xs * c->shade.dadx) + ci.ydady;
				501	r = ADJUST_COLOR_ITERATOR(r, c->shade.drdx, xc);
				502	g = ADJUST_COLOR_ITERATOR(g, c->shade.dgdx, xc);
				503	b = ADJUST_COLOR_ITERATOR(b, c->shade.dbdx, xc);
				504	a = ADJUST_COLOR_ITERATOR(a, c->shade.dadx, xc);
				505	} else {
				506	r = ci.ydrdy;
				507	g = ci.ydgdy;
				508	b = ci.ydbdy;
				509	a = ci.ydady;
				510	}
				511
				512	// z iterators are 1.31
				513	GGLfixed z = (xs * c->shade.dzdx) + ci.ydzdy;
				514	GGLfixed f = (xs * c->shade.dfdx) + ci.ydfdy;
				515
				516	struct {
				517	GGLfixed s, t;
				518	} tc[GGL_TEXTURE_UNIT_COUNT];
				519	if (enables & GGL_ENABLE_TMUS) {
				520	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
				521	if (c->state.texture[i].enable) {
				522	texture_iterators_t& ti = c->state.texture[i].iterators;
				523	if (enables & GGL_ENABLE_W) {
				524	tc[i].s = ti.ydsdy;
				525	tc[i].t = ti.ydtdy;
				526	} else {
				527	tc[i].s = (xs * ti.dsdx) + ti.ydsdy;
				528	tc[i].t = (xs * ti.dtdx) + ti.ydtdy;
				529	}
				530	}
				531	}
				532	}
				533
				534	pixel_t fragment;
				535	pixel_t texel;
				536	pixel_t fb;
				537
				538	uint32_t x = xs;
				539	uint32_t y = c->iterators.y;
				540
				541	while (xc--) {
				542
				543	{ // just a scope
				544
				545	// read color (convert to 8 bits by keeping only the integer part)
				546	fragment.s[1] = fragment.s[2] =
				547	fragment.s[3] = fragment.s[0] = 8;
				548	fragment.c[1] = r >> (GGL_COLOR_BITS-8);
				549	fragment.c[2] = g >> (GGL_COLOR_BITS-8);
				550	fragment.c[3] = b >> (GGL_COLOR_BITS-8);
				551	fragment.c[0] = a >> (GGL_COLOR_BITS-8);
				552
				553	// texturing
				554	if (enables & GGL_ENABLE_TMUS) {
				555	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
				556	texture_t& tx = c->state.texture[i];
				557	if (!tx.enable)
				558	continue;
				559	texture_iterators_t& ti = tx.iterators;
				560	int32_t u, v;
				561
				562	// s-coordinate
				563	if (tx.s_coord != GGL_ONE_TO_ONE) {
				564	const int w = tx.surface.width;
				565	u = wrapping(tc[i].s, w, tx.s_wrap);
				566	tc[i].s += ti.dsdx;
				567	} else {
				568	u = (((tx.shade.is0>>16) + x)<<16) + FIXED_HALF;
				569	}
				570
				571	// t-coordinate
				572	if (tx.t_coord != GGL_ONE_TO_ONE) {
				573	const int h = tx.surface.height;
				574	v = wrapping(tc[i].t, h, tx.t_wrap);
				575	tc[i].t += ti.dtdx;
				576	} else {
				577	v = (((tx.shade.it0>>16) + y)<<16) + FIXED_HALF;
				578	}
				579
				580	// read texture
				581	if (tx.mag_filter == GGL_NEAREST &&
				582	tx.min_filter == GGL_NEAREST)
				583	{
				584	u >>= 16;
				585	v >>= 16;
				586	tx.surface.read(&tx.surface, c, u, v, &texel);
				587	} else {
				588	const int w = tx.surface.width;
				589	const int h = tx.surface.height;
				590	u -= FIXED_HALF;
				591	v -= FIXED_HALF;
				592	int u0 = u >> 16;
				593	int v0 = v >> 16;
				594	int u1 = u0 + 1;
				595	int v1 = v0 + 1;
				596	if (tx.s_wrap == GGL_REPEAT) {
				597	if (u0<0) u0 += w;
				598	if (u1<0) u1 += w;
				599	if (u0>=w) u0 -= w;
				600	if (u1>=w) u1 -= w;
				601	} else {
				602	if (u0<0) u0 = 0;
				603	if (u1<0) u1 = 0;
				604	if (u0>=w) u0 = w-1;
				605	if (u1>=w) u1 = w-1;
				606	}
				607	if (tx.t_wrap == GGL_REPEAT) {
				608	if (v0<0) v0 += h;
				609	if (v1<0) v1 += h;
				610	if (v0>=h) v0 -= h;
				611	if (v1>=h) v1 -= h;
				612	} else {
				613	if (v0<0) v0 = 0;
				614	if (v1<0) v1 = 0;
				615	if (v0>=h) v0 = h-1;
				616	if (v1>=h) v1 = h-1;
				617	}
				618	pixel_t texels[4];
				619	uint32_t mm[4];
				620	tx.surface.read(&tx.surface, c, u0, v0, &texels[0]);
				621	tx.surface.read(&tx.surface, c, u0, v1, &texels[1]);
				622	tx.surface.read(&tx.surface, c, u1, v0, &texels[2]);
				623	tx.surface.read(&tx.surface, c, u1, v1, &texels[3]);
				624	u = (u >> 12) & 0xF;
				625	v = (v >> 12) & 0xF;
				626	u += u>>3;
				627	v += v>>3;
				628	mm[0] = (0x10 - u) * (0x10 - v);
				629	mm[1] = (0x10 - u) * v;
				630	mm[2] = u * (0x10 - v);
				631	mm[3] = 0x100 - (mm[0] + mm[1] + mm[2]);
				632	for (int j=0 ; j<4 ; j++) {
				633	texel.s[j] = texels[0].s[j];
				634	if (!texel.s[j]) continue;
				635	texel.s[j] += 8;
				636	texel.c[j] = texels[0].c[j]*mm[0] +
				637	texels[1].c[j]*mm[1] +
				638	texels[2].c[j]*mm[2] +
				639	texels[3].c[j]*mm[3] ;
				640	}
				641	}
				642
				643	// Texture environnement...
				644	for (int j=0 ; j<4 ; j++) {
				645	uint32_t& Cf = fragment.c[j];
				646	uint32_t& Ct = texel.c[j];
				647	uint8_t& sf = fragment.s[j];
				648	uint8_t& st = texel.s[j];
				649	uint32_t At = texel.c[0];
				650	uint8_t sat = texel.s[0];
				651	switch (tx.env) {
				652	case GGL_REPLACE:
				653	if (st) {
				654	Cf = Ct;
				655	sf = st;
				656	}
				657	break;
				658	case GGL_MODULATE:
				659	if (st) {
				660	uint32_t factor = Ct + (Ct>>(st-1));
				661	Cf = (Cf * factor) >> st;
				662	}
				663	break;
				664	case GGL_DECAL:
				665	if (sat) {
				666	rescale(Cf, sf, Ct, st);
				667	Cf += ((Ct - Cf) * (At + (At>>(sat-1)))) >> sat;
				668	}
				669	break;
				670	case GGL_BLEND:
				671	if (st) {
				672	uint32_t Cc = tx.env_color[i];
				673	if (sf>8) Cc = (Cc * ((1<<sf)-1))>>8;
				674	else if (sf<8) Cc = (Cc - (Cc>>(8-sf)))>>(8-sf);
				675	uint32_t factor = Ct + (Ct>>(st-1));
				676	Cf = ((((1<<st) - factor) * Cf) + Ct*Cc)>>st;
				677	}
				678	break;
				679	}
				680	}
				681	}
				682	}
				683
				684	// coverage application
				685	if (enables & GGL_ENABLE_AA) {
				686	int16_t cf = *covPtr++;
				687	fragment.c[0] = (int64_t(fragment.c[0]) * cf) >> 15;
				688	}
				689
				690	// alpha-test
				691	if (enables & GGL_ENABLE_ALPHA_TEST) {
				692	GGLcolor ref = c->state.alpha_test.ref;
				693	GGLcolor alpha = (uint64_t(fragment.c[0]) *
				694	((1<<GGL_COLOR_BITS)-1)) / ((1<<fragment.s[0])-1);
				695	switch (c->state.alpha_test.func) {
				696	case GGL_NEVER: goto discard;
				697	case GGL_LESS: if (alpha<ref) break; goto discard;
				698	case GGL_EQUAL: if (alpha==ref) break; goto discard;
				699	case GGL_LEQUAL: if (alpha<=ref) break; goto discard;
				700	case GGL_GREATER: if (alpha>ref) break; goto discard;
				701	case GGL_NOTEQUAL: if (alpha!=ref) break; goto discard;
				702	case GGL_GEQUAL: if (alpha>=ref) break; goto discard;
				703	}
				704	}
				705
				706	// depth test
				707	if (c->state.buffers.depth.format) {
				708	if (enables & GGL_ENABLE_DEPTH_TEST) {
				709	surface_t* cb = &(c->state.buffers.depth);
				710	uint16_t* p = (uint16_t)(cb->data)+(x+(cb->stridey));
				711	uint16_t zz = uint32_t(z)>>(16);
				712	uint16_t depth = *p;
				713	switch (c->state.depth_test.func) {
				714	case GGL_NEVER: goto discard;
				715	case GGL_LESS: if (zz<depth) break; goto discard;
				716	case GGL_EQUAL: if (zz==depth) break; goto discard;
				717	case GGL_LEQUAL: if (zz<=depth) break; goto discard;
				718	case GGL_GREATER: if (zz>depth) break; goto discard;
				719	case GGL_NOTEQUAL: if (zz!=depth) break; goto discard;
				720	case GGL_GEQUAL: if (zz>=depth) break; goto discard;
				721	}
				722	// depth buffer is not enabled, if depth-test is not enabled
				723	/*
				724	fragment.s[1] = fragment.s[2] =
				725	fragment.s[3] = fragment.s[0] = 8;
				726	fragment.c[1] =
				727	fragment.c[2] =
				728	fragment.c[3] =
				729	fragment.c[0] = 255 - (zz>>8);
				730	*/
				731	if (c->state.mask.depth) {
				732	*p = zz;
				733	}
				734	}
				735	}
				736
				737	// fog
				738	if (enables & GGL_ENABLE_FOG) {
				739	for (int i=1 ; i<=3 ; i++) {
				740	GGLfixed fc = (c->state.fog.color[i] * 0x10000) / 0xFF;
				741	uint32_t& c = fragment.c[i];
				742	uint8_t& s = fragment.s[i];
				743	c = (c * 0x10000) / ((1<<s)-1);
				744	c = gglMulAddx(c, f, gglMulx(fc, 0x10000 - f));
				745	s = 16;
				746	}
				747	}
				748
				749	// blending
				750	if (enables & GGL_ENABLE_BLENDING) {
				751	fb.c[1] = fb.c[2] = fb.c[3] = fb.c[0] = 0; // placate valgrind
				752	fb.s[1] = fb.s[2] = fb.s[3] = fb.s[0] = 0;
				753	c->state.buffers.color.read(
				754	&(c->state.buffers.color), c, x, y, &fb);
				755	blending( c, &fragment, &fb );
				756	}
				757
				758	// write
				759	c->state.buffers.color.write(
				760	&(c->state.buffers.color), c, x, y, &fragment);
				761	}
				762
				763	discard:
				764	// iterate...
				765	x += 1;
				766	if (enables & GGL_ENABLE_SMOOTH) {
				767	r += c->shade.drdx;
				768	g += c->shade.dgdx;
				769	b += c->shade.dbdx;
				770	a += c->shade.dadx;
				771	}
				772	z += c->shade.dzdx;
				773	f += c->shade.dfdx;
				774	}
				775	}
				776
				777	#endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
				778
				779	// ----------------------------------------------------------------------------
				780	#if 0
				781	#pragma mark -
				782	#pragma mark Scanline
				783	#endif
				784
				785	template <typename T, typename U>
				786	static inline __attribute__((const))
				787	T interpolate(int y, T v0, U dvdx, U dvdy) {
				788	// interpolates in pixel's centers
				789	// v = v0 + (y + 0.5) * dvdy + (0.5 * dvdx)
				790	return (y * dvdy) + (v0 + ((dvdy + dvdx) >> 1));
				791	}
				792
				793	// ----------------------------------------------------------------------------
				794	#if 0
				795	#pragma mark -
				796	#endif
				797
				798	void init_y(context_t* c, int32_t ys)
				799	{
				800	const uint32_t enables = c->state.enables;
				801
				802	// compute iterators...
				803	iterators_t& ci = c->iterators;
				804
				805	// sample in the center
				806	ci.y = ys;
				807
				808	if (enables & (GGL_ENABLE_DEPTH_TEST\|GGL_ENABLE_W\|GGL_ENABLE_FOG)) {
				809	ci.ydzdy = interpolate(ys, c->shade.z0, c->shade.dzdx, c->shade.dzdy);
				810	ci.ydwdy = interpolate(ys, c->shade.w0, c->shade.dwdx, c->shade.dwdy);
				811	ci.ydfdy = interpolate(ys, c->shade.f0, c->shade.dfdx, c->shade.dfdy);
				812	}
				813
				814	if (ggl_unlikely(enables & GGL_ENABLE_SMOOTH)) {
				815	ci.ydrdy = interpolate(ys, c->shade.r0, c->shade.drdx, c->shade.drdy);
				816	ci.ydgdy = interpolate(ys, c->shade.g0, c->shade.dgdx, c->shade.dgdy);
				817	ci.ydbdy = interpolate(ys, c->shade.b0, c->shade.dbdx, c->shade.dbdy);
				818	ci.ydady = interpolate(ys, c->shade.a0, c->shade.dadx, c->shade.dady);
				819	c->step_y = step_y__smooth;
				820	} else {
				821	ci.ydrdy = c->shade.r0;
				822	ci.ydgdy = c->shade.g0;
				823	ci.ydbdy = c->shade.b0;
				824	ci.ydady = c->shade.a0;
				825	// XXX: do only if needed, or make sure this is fast
				826	c->packed = ggl_pack_color(c, c->state.buffers.color.format,
				827	ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
				828	c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
				829	ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
				830	}
				831
				832	// initialize the variables we need in the shader
				833	generated_vars_t& gen = c->generated_vars;
				834	gen.argb[GGLFormat::ALPHA].c = ci.ydady;
				835	gen.argb[GGLFormat::ALPHA].dx = c->shade.dadx;
				836	gen.argb[GGLFormat::RED ].c = ci.ydrdy;
				837	gen.argb[GGLFormat::RED ].dx = c->shade.drdx;
				838	gen.argb[GGLFormat::GREEN].c = ci.ydgdy;
				839	gen.argb[GGLFormat::GREEN].dx = c->shade.dgdx;
				840	gen.argb[GGLFormat::BLUE ].c = ci.ydbdy;
				841	gen.argb[GGLFormat::BLUE ].dx = c->shade.dbdx;
				842	gen.dzdx = c->shade.dzdx;
				843	gen.f = ci.ydfdy;
				844	gen.dfdx = c->shade.dfdx;
				845
				846	if (enables & GGL_ENABLE_TMUS) {
				847	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
				848	texture_t& t = c->state.texture[i];
				849	if (!t.enable) continue;
				850
				851	texture_iterators_t& ti = t.iterators;
				852	if (t.s_coord == GGL_ONE_TO_ONE && t.t_coord == GGL_ONE_TO_ONE) {
				853	// we need to set all of these to 0 because in some cases
				854	// step_y__generic() or step_y__tmu() will be used and
				855	// therefore will update dtdy, however, in 1:1 mode
				856	// this is always done by the scanline rasterizer.
				857	ti.dsdx = ti.dsdy = ti.dtdx = ti.dtdy = 0;
				858	ti.ydsdy = t.shade.is0;
				859	ti.ydtdy = t.shade.it0;
				860	} else {
				861	const int adjustSWrap = ((t.s_wrap==GGL_CLAMP)?0:16);
				862	const int adjustTWrap = ((t.t_wrap==GGL_CLAMP)?0:16);
				863	ti.sscale = t.shade.sscale + adjustSWrap;
				864	ti.tscale = t.shade.tscale + adjustTWrap;
				865	if (!(enables & GGL_ENABLE_W)) {
				866	// S coordinate
				867	const int32_t sscale = ti.sscale;
				868	const int32_t sy = interpolate(ys,
				869	t.shade.is0, t.shade.idsdx, t.shade.idsdy);
				870	if (sscale>=0) {
				871	ti.ydsdy= sy << sscale;
				872	ti.dsdx = t.shade.idsdx << sscale;
				873	ti.dsdy = t.shade.idsdy << sscale;
				874	} else {
				875	ti.ydsdy= sy >> -sscale;
				876	ti.dsdx = t.shade.idsdx >> -sscale;
				877	ti.dsdy = t.shade.idsdy >> -sscale;
				878	}
				879	// T coordinate
				880	const int32_t tscale = ti.tscale;
				881	const int32_t ty = interpolate(ys,
				882	t.shade.it0, t.shade.idtdx, t.shade.idtdy);
				883	if (tscale>=0) {
				884	ti.ydtdy= ty << tscale;
				885	ti.dtdx = t.shade.idtdx << tscale;
				886	ti.dtdy = t.shade.idtdy << tscale;
				887	} else {
				888	ti.ydtdy= ty >> -tscale;
				889	ti.dtdx = t.shade.idtdx >> -tscale;
				890	ti.dtdy = t.shade.idtdy >> -tscale;
				891	}
				892	}
				893	}
				894	// mirror for generated code...
				895	generated_tex_vars_t& gen = c->generated_vars.texture[i];
				896	gen.width = t.surface.width;
				897	gen.height = t.surface.height;
				898	gen.stride = t.surface.stride;
				899	gen.data = int32_t(t.surface.data);
				900	gen.dsdx = ti.dsdx;
				901	gen.dtdx = ti.dtdx;
				902	}
				903	}
				904
				905	// choose the y-stepper
				906	c->step_y = step_y__nop;
				907	if (enables & GGL_ENABLE_FOG) {
				908	c->step_y = step_y__generic;
				909	} else if (enables & GGL_ENABLE_TMUS) {
				910	if (enables & GGL_ENABLE_SMOOTH) {
				911	c->step_y = step_y__generic;
				912	} else if (enables & GGL_ENABLE_W) {
				913	c->step_y = step_y__w;
				914	} else {
				915	c->step_y = step_y__tmu;
				916	}
				917	} else {
				918	if (enables & GGL_ENABLE_SMOOTH) {
				919	c->step_y = step_y__smooth;
				920	}
				921	}
				922
				923	// choose the rectangle blitter
				924	c->rect = rect_generic;
				925	if ((c->step_y == step_y__nop) &&
				926	(c->scanline == scanline_memcpy))
				927	{
				928	c->rect = rect_memcpy;
				929	}
				930	}
				931
				932	void init_y_packed(context_t* c, int32_t y0)
				933	{
				934	uint8_t f = c->state.buffers.color.format;
				935	c->packed = ggl_pack_color(c, f,
				936	c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
				937	c->iterators.y = y0;
				938	c->step_y = step_y__nop;
				939	// choose the rectangle blitter
				940	c->rect = rect_generic;
				941	if (c->scanline == scanline_memcpy) {
				942	c->rect = rect_memcpy;
				943	}
				944	}
				945
				946	void init_y_noop(context_t* c, int32_t y0)
				947	{
				948	c->iterators.y = y0;
				949	c->step_y = step_y__nop;
				950	// choose the rectangle blitter
				951	c->rect = rect_generic;
				952	if (c->scanline == scanline_memcpy) {
				953	c->rect = rect_memcpy;
				954	}
				955	}
				956
				957	void init_y_error(context_t* c, int32_t y0)
				958	{
				959	// woooops, shoud never happen,
				960	// fail gracefully (don't display anything)
				961	init_y_noop(c, y0);
				962	LOGE("color-buffer has an invalid format!");
				963	}
				964
				965	// ----------------------------------------------------------------------------
				966	#if 0
				967	#pragma mark -
				968	#endif
				969
				970	void step_y__generic(context_t* c)
				971	{
				972	const uint32_t enables = c->state.enables;
				973
				974	// iterate...
				975	iterators_t& ci = c->iterators;
				976	ci.y += 1;
				977
				978	if (enables & GGL_ENABLE_SMOOTH) {
				979	ci.ydrdy += c->shade.drdy;
				980	ci.ydgdy += c->shade.dgdy;
				981	ci.ydbdy += c->shade.dbdy;
				982	ci.ydady += c->shade.dady;
				983	}
				984
				985	const uint32_t mask =
				986	GGL_ENABLE_DEPTH_TEST \|
				987	GGL_ENABLE_W \|
				988	GGL_ENABLE_FOG;
				989	if (enables & mask) {
				990	ci.ydzdy += c->shade.dzdy;
				991	ci.ydwdy += c->shade.dwdy;
				992	ci.ydfdy += c->shade.dfdy;
				993	}
				994
				995	if ((enables & GGL_ENABLE_TMUS) && (!(enables & GGL_ENABLE_W))) {
				996	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
				997	if (c->state.texture[i].enable) {
				998	texture_iterators_t& ti = c->state.texture[i].iterators;
				999	ti.ydsdy += ti.dsdy;
				1000	ti.ydtdy += ti.dtdy;
				1001	}
				1002	}
				1003	}
				1004	}
				1005
				1006	void step_y__nop(context_t* c)
				1007	{
				1008	c->iterators.y += 1;
				1009	c->iterators.ydzdy += c->shade.dzdy;
				1010	}
				1011
				1012	void step_y__smooth(context_t* c)
				1013	{
				1014	iterators_t& ci = c->iterators;
				1015	ci.y += 1;
				1016	ci.ydrdy += c->shade.drdy;
				1017	ci.ydgdy += c->shade.dgdy;
				1018	ci.ydbdy += c->shade.dbdy;
				1019	ci.ydady += c->shade.dady;
				1020	ci.ydzdy += c->shade.dzdy;
				1021	}
				1022
				1023	void step_y__w(context_t* c)
				1024	{
				1025	iterators_t& ci = c->iterators;
				1026	ci.y += 1;
				1027	ci.ydzdy += c->shade.dzdy;
				1028	ci.ydwdy += c->shade.dwdy;
				1029	}
				1030
				1031	void step_y__tmu(context_t* c)
				1032	{
				1033	iterators_t& ci = c->iterators;
				1034	ci.y += 1;
				1035	ci.ydzdy += c->shade.dzdy;
				1036	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
				1037	if (c->state.texture[i].enable) {
				1038	texture_iterators_t& ti = c->state.texture[i].iterators;
				1039	ti.ydsdy += ti.dsdy;
				1040	ti.ydtdy += ti.dtdy;
				1041	}
				1042	}
				1043	}
				1044
				1045	// ----------------------------------------------------------------------------
				1046	#if 0
				1047	#pragma mark -
				1048	#endif
				1049
				// Renders the current scanline [xl, xr) with perspective-corrected texture
				// coordinates for every enabled texture unit.
				//
				// The scanline is cut into spans of (1 << SPAN_BITS) pixels followed by one
				// partial span. gglRecipQ(w, 30) (presumably a fixed-point reciprocal of w)
				// is evaluated only at span boundaries; inside a span the coordinates are
				// stepped linearly, and each span is drawn through c->span(c).
				//
				// c->iterators.xl / xr are overwritten with each span's bounds and are not
				// restored before returning.
				1050	void scanline_perspective(context_t* c)
				1051	{
				// Per-unit running coordinates. The union lets the same storage be read
				// by name (s, sq, t, tq) or by index: st[0] = {s, sq}, st[1] = {t, tq}.
				1052	struct {
				1053	union {
				1054	struct {
				1055	int32_t s, sq;
				1056	int32_t t, tq;
				1057	};
				1058	struct {
				1059	int32_t v, q;
				1060	} st[2];
				1061	};
				1062	} tc[GGL_TEXTURE_UNIT_COUNT] __attribute__((aligned(16)));
				1063
				1064	// XXX: we should have a special case when dwdx = 0
				1065
				1066	// 32 pixels spans works okay. 16 is a lot better,
				1067	// but hey, it's a software renderer...
				1068	const uint32_t SPAN_BITS = 5;
				1069	const uint32_t ys = c->iterators.y;
				1070	const uint32_t xs = c->iterators.xl;
				1071	const uint32_t x1 = c->iterators.xr;
				1072	const uint32_t xc = x1 - xs;
				// pixels left over after the full spans, and the number of full spans
				1073	uint32_t remainder = xc & ((1<<SPAN_BITS)-1);
				1074	uint32_t numSpans = xc >> SPAN_BITS;
				1075
				1076	const iterators_t& ci = c->iterators;
				// w at the left edge of the scanline and its gglRecipQ() value
				1077	int32_t w0 = (xs * c->shade.dwdx) + ci.ydwdy;
				1078	int32_t q0 = gglRecipQ(w0, 30);
				// Derived from the first q0 only and reused for every span
				// (assuming gglClz counts leading zero bits, this is q0's bit width).
				1079	const int iwscale = 32 - gglClz(q0);
				1080
				// w increment across one full span
				1081	const int32_t dwdx = c->shade.dwdx << SPAN_BITS;
				1082	int32_t xl = c->iterators.xl;
				1083
				1084	// We process s & t with a loop to reduce the code size
				1085	// (and i-cache pressure).
				1086
				1087	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
				1088	const texture_t& tmu = c->state.texture[i];
				1089	if (!tmu.enable) continue;
				// s and t at (xs, ys); the (dx + dy) >> 1 term adds half a step in
				// each direction (presumably pixel-centre sampling -- TODO confirm).
				1090	int32_t s = tmu.shade.is0 +
				1091	(tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
				1092	((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
				1093	int32_t t = tmu.shade.it0 +
				1094	(tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
				1095	((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
				1096	tc[i].s = s;
				1097	tc[i].t = t;
				// coordinates multiplied by q0 at the left edge of the first span
				1098	tc[i].sq = gglMulx(s, q0, iwscale);
				1099	tc[i].tq = gglMulx(t, q0, iwscale);
				1100	}
				1101
				// span == 0 means "full span"; it is only set for the final partial span
				1102	int32_t span = 0;
				1103	do {
				// w at the right edge of this span
				1104	int32_t w1;
				1105	if (ggl_likely(numSpans)) {
				1106	w1 = w0 + dwdx;
				1107	} else {
				1108	if (remainder) {
				1109	// finish off the scanline...
				1110	span = remainder;
				1111	w1 = (c->shade.dwdx * span) + w0;
				1112	} else {
				1113	break;
				1114	}
				1115	}
				1116	int32_t q1 = gglRecipQ(w1, 30);
				1117	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
				1118	texture_t& tmu = c->state.texture[i];
				1119	if (!tmu.enable) continue;
				1120	texture_iterators_t& ti = tmu.iterators;
				1121
				// j == 0 handles s, j == 1 handles t (see the union above)
				1122	for (int j=0 ; j<2 ; j++) {
				// advance the linear coordinate to the right edge of the span;
				// st[j].dx is presumably the per-pixel step (idsdx / idtdx) -- confirm
				1123	int32_t v = tc[i].st[j].v;
				1124	if (span) v += (tmu.shade.st[j].dx)*span;
				1125	else v += (tmu.shade.st[j].dx)<<SPAN_BITS;
				// v0 / v1: q-multiplied coordinate at the left / right edge
				1126	const int32_t v0 = tc[i].st[j].q;
				1127	const int32_t v1 = gglMulx(v, q1, iwscale);
				// per-pixel step inside the span (divide by the span length)
				1128	int32_t dvdx = v1 - v0;
				1129	if (span) dvdx /= span;
				1130	else dvdx >>= SPAN_BITS;
				// the right edge becomes the next span's left edge
				1131	tc[i].st[j].v = v;
				1132	tc[i].st[j].q = v1;
				1133
				// unit's own scale, adjusted by (iwscale - 30); a negative value
				// selects a right shift instead of a left shift
				1134	const int scale = ti.st[j].scale + (iwscale - 30);
				1135	if (scale >= 0) {
				1136	ti.st[j].ydvdy = v0 << scale;
				1137	ti.st[j].dvdx = dvdx << scale;
				1138	} else {
				1139	ti.st[j].ydvdy = v0 >> -scale;
				1140	ti.st[j].dvdx = dvdx >> -scale;
				1141	}
				1142	}
				// mirror the per-pixel steps for generated code
				1143	generated_tex_vars_t& gen = c->generated_vars.texture[i];
				1144	gen.dsdx = ti.st[0].dvdx;
				1145	gen.dtdx = ti.st[1].dvdx;
				1146	}
				// publish this span's bounds, then move xl to the start of the next span
				1147	c->iterators.xl = xl;
				1148	c->iterators.xr = xl = xl + (span ? span : (1<<SPAN_BITS));
				1149	w0 = w1;
				1150	q0 = q1;
				1151	c->span(c);
				// After the last full span numSpans reaches 0 and one more pass handles
				// the partial span (or breaks out); the post-decrement test then ends
				// the loop.
				1152	} while(numSpans--);
				1153	}
				1154
				// Perspective-corrected scanline renderer that processes exactly one
				// texture unit: the one whose index is 31 - gglClz(c->state.enabled_tmu)
				// (the highest set bit, assuming gglClz counts leading zero bits).
				//
				// Unlike scanline_perspective(), the partial span (xc modulo 32 pixels) is
				// emitted first and the full 32-pixel spans follow. When dwdx == 0 the whole
				// scanline is emitted as a single span using the initial iw.
				//
				// c->iterators.xl / xr are overwritten with each span's bounds and are not
				// restored before returning.
				1155	void scanline_perspective_single(context_t* c)
				1156	{
				1157	// 32 pixels spans works okay. 16 is a lot better,
				1158	// but hey, it's a software renderer...
				1159	const uint32_t SPAN_BITS = 5;
				1160	const uint32_t ys = c->iterators.y;
				1161	const uint32_t xs = c->iterators.xl;
				1162	const uint32_t x1 = c->iterators.xr;
				1163	const uint32_t xc = x1 - xs;
				1164
				1165	const iterators_t& ci = c->iterators;
				// w at the left edge of the scanline and its gglRecipQ() value
				1166	int32_t w = (xs * c->shade.dwdx) + ci.ydwdy;
				1167	int32_t iw = gglRecipQ(w, 30);
				// derived from the first iw only and reused for every span
				1168	const int iwscale = 32 - gglClz(iw);
				1169
				1170	const int i = 31 - gglClz(c->state.enabled_tmu);
				1171	generated_tex_vars_t& gen = c->generated_vars.texture[i];
				1172	texture_t& tmu = c->state.texture[i];
				1173	texture_iterators_t& ti = tmu.iterators;
				// unit's own scales adjusted by (iwscale - 30); negative means right shift
				1174	const int sscale = ti.sscale + (iwscale - 30);
				1175	const int tscale = ti.tscale + (iwscale - 30);
				// s and t at (xs, ys); the (dx + dy) >> 1 term adds half a step in each
				// direction (presumably pixel-centre sampling -- TODO confirm)
				1176	int32_t s = tmu.shade.is0 +
				1177	(tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
				1178	((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
				1179	int32_t t = tmu.shade.it0 +
				1180	(tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
				1181	((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
				// iw-multiplied coordinates at the left edge of the current span
				1182	int32_t s0 = gglMulx(s, iw, iwscale);
				1183	int32_t t0 = gglMulx(t, iw, iwscale);
				1184	int32_t xl = c->iterators.xl;
				1185
				// Declared without initialisers, ahead of every goto and label below.
				1186	int32_t sq, tq, dsdx, dtdx;
				1187	int32_t premainder = xc & ((1<<SPAN_BITS)-1);
				1188	uint32_t numSpans = xc >> SPAN_BITS;
				1189	if (c->shade.dwdx == 0) {
				1190	// XXX: we could choose to do this if the error is small enough
				// w is constant: treat the whole scanline as one "remainder" span and
				// jump into the if-body below, skipping the w / iw update.
				1191	numSpans = 0;
				1192	premainder = xc;
				1193	goto no_perspective;
				1194	}
				1195
				1196	if (premainder) {
				1197	w += c->shade.dwdx * premainder;
				1198	iw = gglRecipQ(w, 30);
				1199	no_perspective:
				1200	s += tmu.shade.idsdx * premainder;
				1201	t += tmu.shade.idtdx * premainder;
				1202	sq = gglMulx(s, iw, iwscale);
				1203	tq = gglMulx(t, iw, iwscale);
				// NOTE(review): premainder can only be 0 here when arriving through
				// no_perspective with xc == 0, which would divide by zero; presumably
				// callers never submit an empty scanline -- confirm.
				1204	dsdx = (sq - s0) / premainder;
				1205	dtdx = (tq - t0) / premainder;
				1206	c->iterators.xl = xl;
				1207	c->iterators.xr = xl = xl + premainder;
				// Jump into the while-body to reuse the scale-and-draw code; execution
				// then falls through to the loop's numSpans-- test.
				1208	goto finish;
				1209	}
				1210
				1211	while (numSpans--) {
				// advance w, s and t by one full span and re-evaluate iw at its right edge
				1212	w += c->shade.dwdx << SPAN_BITS;
				1213	s += tmu.shade.idsdx << SPAN_BITS;
				1214	t += tmu.shade.idtdx << SPAN_BITS;
				1215	iw = gglRecipQ(w, 30);
				1216	sq = gglMulx(s, iw, iwscale);
				1217	tq = gglMulx(t, iw, iwscale);
				// per-pixel step inside the span
				1218	dsdx = (sq - s0) >> SPAN_BITS;
				1219	dtdx = (tq - t0) >> SPAN_BITS;
				1220	c->iterators.xl = xl;
				1221	c->iterators.xr = xl = xl + (1<<SPAN_BITS);
				// shared tail: scale start value and step, then draw the span
				1222	finish:
				1223	if (sscale >= 0) {
				1224	ti.ydsdy = s0 << sscale;
				1225	ti.dsdx = dsdx << sscale;
				1226	} else {
				1227	ti.ydsdy = s0 >>-sscale;
				1228	ti.dsdx = dsdx >>-sscale;
				1229	}
				1230	if (tscale >= 0) {
				1231	ti.ydtdy = t0 << tscale;
				1232	ti.dtdx = dtdx << tscale;
				1233	} else {
				1234	ti.ydtdy = t0 >>-tscale;
				1235	ti.dtdx = dtdx >>-tscale;
				1236	}
				// the right edge of this span is the left edge of the next one
				1237	s0 = sq;
				1238	t0 = tq;
				// mirror the per-pixel steps for generated code
				1239	gen.dsdx = ti.dsdx;
				1240	gen.dtdx = ti.dtdx;
				1241	c->span(c);
				1242	}
				1243	}
				1244
				1245	// ----------------------------------------------------------------------------
				1246
				1247	void scanline_t32cb16(context_t* c)
				1248	{
				1249	int32_t x = c->iterators.xl;
				1250	size_t ct = c->iterators.xr - x;
				1251	int32_t y = c->iterators.y;
				1252	surface_t* cb = &(c->state.buffers.color);
				1253	union {
				1254	uint16_t* dst;
				1255	uint32_t* dst32;
				1256	};
				1257	dst = reinterpret_cast<uint16_t>(cb->data) + (x+(cb->stridey));
				1258
				1259	surface_t* tex = &(c->state.texture[0].surface);
				1260	const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
				1261	const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
				1262	uint32_t src = reinterpret_cast<uint32_t>(tex->data)+(u+(tex->stride*v));
				1263	int sR, sG, sB;
				1264	uint32_t s, d;
				1265
				1266	if (ct==1 \|\| uint32_t(dst)&2) {
				1267	last_one:
				1268	s = GGL_RGBA_TO_HOST( *src++ );
				1269	sR = (s >> ( 3))&0x1F;
				1270	sG = (s >> ( 8+2))&0x3F;
				1271	sB = (s >> (16+3))&0x1F;
				1272	*dst++ = uint16_t((sR<<11)\|(sG<<5)\|sB);
				1273	ct--;
				1274	}
				1275
				1276	while (ct > 0) {
				1277	s = GGL_RGBA_TO_HOST( *src++ );
				1278	sR = (s >> ( 3))&0x1F;
				1279	sG = (s >> ( 8+2))&0x3F;
				1280	sB = (s >> (16+3))&0x1F;
				1281	d = (sR<<11)\|(sG<<5)\|sB;
				1282
				1283	s = GGL_RGBA_TO_HOST( *src++ );
				1284	sR = (s >> ( 3))&0x1F;
				1285	sG = (s >> ( 8+2))&0x3F;
				1286	sB = (s >> (16+3))&0x1F;
				1287	d \|= ((sR<<11)\|(sG<<5)\|sB)<<16;
				1288
				1289	#if BYTE_ORDER == BIG_ENDIAN
				1290	d = (d>>16) \| (d<<16);
				1291	#endif
				1292
				1293	*dst32++ = d;
				1294	ct -= 2;
				1295	}
				1296
				1297	if (ct > 0) {
				1298	goto last_one;
				1299	}
				1300	}
				1301
				1302	void scanline_t32cb16blend(context_t* c)
				1303	{
				1304	int32_t x = c->iterators.xl;
				1305	size_t ct = c->iterators.xr - x;
				1306	int32_t y = c->iterators.y;
				1307	surface_t* cb = &(c->state.buffers.color);
				1308	uint16_t* dst = reinterpret_cast<uint16_t>(cb->data) + (x+(cb->stridey));
				1309
				1310	surface_t* tex = &(c->state.texture[0].surface);
				1311	const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
				1312	const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
				1313	uint32_t src = reinterpret_cast<uint32_t>(tex->data)+(u+(tex->stride*v));
				1314
				1315	#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
				1316	scanline_t32cb16blend_arm(dst, src, ct);
				1317	#else
				1318	while (ct--) {
				1319	uint32_t s = *src++;
				1320	if (!s) {
				1321	dst++;
				1322	continue;
				1323	}
				1324	uint16_t d = *dst;
				1325	s = GGL_RGBA_TO_HOST(s);
				1326	int sR = (s >> ( 3))&0x1F;
				1327	int sG = (s >> ( 8+2))&0x3F;
				1328	int sB = (s >> (16+3))&0x1F;
				1329	int sA = (s>>24);
				1330	int f = 0x100 - (sA + (sA>>7));
				1331	int dR = (d>>11)&0x1f;
				1332	int dG = (d>>5)&0x3f;
				1333	int dB = (d)&0x1f;
				1334	sR += (f*dR)>>8;
				1335	sG += (f*dG)>>8;
				1336	sB += (f*dB)>>8;
				1337	*dst++ = uint16_t((sR<<11)\|(sG<<5)\|sB);
				1338	}
				1339	#endif
				1340	}
				1341
				1342	void scanline_memcpy(context_t* c)
				1343	{
				1344	int32_t x = c->iterators.xl;
				1345	size_t ct = c->iterators.xr - x;
				1346	int32_t y = c->iterators.y;
				1347	surface_t* cb = &(c->state.buffers.color);
				1348	const GGLFormat* fp = &(c->formats[cb->format]);
				1349	uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
				1350	(x + (cb->stride * y)) * fp->size;
				1351
				1352	surface_t* tex = &(c->state.texture[0].surface);
				1353	const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
				1354	const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
				1355	uint8_t src = reinterpret_cast<uint8_t>(tex->data) +
				1356	(u + (tex->stride * v)) * fp->size;
				1357
				1358	const size_t size = ct * fp->size;
				1359	memcpy(dst, src, size);
				1360	}
				1361
				1362	void scanline_memset8(context_t* c)
				1363	{
				1364	int32_t x = c->iterators.xl;
				1365	size_t ct = c->iterators.xr - x;
				1366	int32_t y = c->iterators.y;
				1367	surface_t* cb = &(c->state.buffers.color);
				1368	uint8_t* dst = reinterpret_cast<uint8_t>(cb->data) + (x+(cb->stridey));
				1369	uint32_t packed = c->packed;
				1370	memset(dst, packed, ct);
				1371	}
				1372
				1373	void scanline_memset16(context_t* c)
				1374	{
				1375	int32_t x = c->iterators.xl;
				1376	size_t ct = c->iterators.xr - x;
				1377	int32_t y = c->iterators.y;
				1378	surface_t* cb = &(c->state.buffers.color);
				1379	uint16_t* dst = reinterpret_cast<uint16_t>(cb->data) + (x+(cb->stridey));
				1380	uint32_t packed = c->packed;
				1381	android_memset16(dst, packed, ct*2);
				1382	}
				1383
				1384	void scanline_memset32(context_t* c)
				1385	{
				1386	int32_t x = c->iterators.xl;
				1387	size_t ct = c->iterators.xr - x;
				1388	int32_t y = c->iterators.y;
				1389	surface_t* cb = &(c->state.buffers.color);
				1390	uint32_t* dst = reinterpret_cast<uint32_t>(cb->data) + (x+(cb->stridey));
				1391	uint32_t packed = GGL_HOST_TO_RGBA(c->packed);
				1392	android_memset32(dst, packed, ct*4);
				1393	}
				1394
				1395	void scanline_clear(context_t* c)
				1396	{
				1397	int32_t x = c->iterators.xl;
				1398	size_t ct = c->iterators.xr - x;
				1399	int32_t y = c->iterators.y;
				1400	surface_t* cb = &(c->state.buffers.color);
				1401	const GGLFormat* fp = &(c->formats[cb->format]);
				1402	uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
				1403	(x + (cb->stride * y)) * fp->size;
				1404	const size_t size = ct * fp->size;
				1405	memset(dst, 0, size);
				1406	}
				1407
				1408	void scanline_set(context_t* c)
				1409	{
				1410	int32_t x = c->iterators.xl;
				1411	size_t ct = c->iterators.xr - x;
				1412	int32_t y = c->iterators.y;
				1413	surface_t* cb = &(c->state.buffers.color);
				1414	const GGLFormat* fp = &(c->formats[cb->format]);
				1415	uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
				1416	(x + (cb->stride * y)) * fp->size;
				1417	const size_t size = ct * fp->size;
				1418	memset(dst, 0xFF, size);
				1419	}
				1420
				1421	void scanline_noop(context_t* c)
				1422	{
				1423	}
				1424
				1425	void rect_generic(context_t* c, size_t yc)
				1426	{
				1427	do {
				1428	c->scanline(c);
				1429	c->step_y(c);
				1430	} while (--yc);
				1431	}
				1432
				1433	void rect_memcpy(context_t* c, size_t yc)
				1434	{
				1435	int32_t x = c->iterators.xl;
				1436	size_t ct = c->iterators.xr - x;
				1437	int32_t y = c->iterators.y;
				1438	surface_t* cb = &(c->state.buffers.color);
				1439	const GGLFormat* fp = &(c->formats[cb->format]);
				1440	uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
				1441	(x + (cb->stride * y)) * fp->size;
				1442
				1443	surface_t* tex = &(c->state.texture[0].surface);
				1444	const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
				1445	const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
				1446	uint8_t src = reinterpret_cast<uint8_t>(tex->data) +
				1447	(u + (tex->stride * v)) * fp->size;
				1448
				1449	if (cb->stride == tex->stride && ct == size_t(cb->stride)) {
				1450	memcpy(dst, src, ct * fp->size * yc);
				1451	} else {
				1452	const size_t size = ct * fp->size;
				1453	const size_t dbpr = cb->stride * fp->size;
				1454	const size_t sbpr = tex->stride * fp->size;
				1455	do {
				1456	memcpy(dst, src, size);
				1457	dst += dbpr;
				1458	src += sbpr;
				1459	} while (--yc);
				1460	}
				1461	}
				1462	// ----------------------------------------------------------------------------
				1463	}; // namespace android
				1464
				1465	using namespace android;
				/*
				 * Test entry point (C linkage): builds a needs_t from the four words
				 * given, runs the scanline assembler on it and prints an error message
				 * if it fails. Without ANDROID_ARM_CODEGEN it only prints a notice.
				 */
				extern "C" void ggl_test_codegen(uint32_t n, uint32_t p, uint32_t t0, uint32_t t1)
				{
				#if !ANDROID_ARM_CODEGEN
				    printf("This test runs only on ARM\n");
				#else
				    GGLContext* ctx;
				    gglInit(&ctx);

				    // describe the pipeline to generate
				    needs_t needs;
				    needs.n = n;
				    needs.p = p;
				    needs.t[0] = t0;
				    needs.t[1] = t1;

				    // assemble it into a 1024-sized ScanlineAssembly
				    sp<ScanlineAssembly> assembly(new ScanlineAssembly(needs, 1024));
				    GGLAssembler assembler( new ARMAssembler(assembly) );
				    const int status = assembler.scanline(needs, (context_t*)ctx);
				    if (status != 0) {
				        printf("error %08x (%s)\n", status, strerror(-status));
				    }

				    gglUninit(ctx);
				#endif
				}
				1487