/* $NetBSD: cg14_accel.c,v 1.33 2024/05/13 10:13:10 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all driver need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"

/*#define SX_DEBUG*/
/*#define SX_TRACE*/

#ifdef SX_TRACE
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#else
#define ENTER
#endif

#ifdef SX_DEBUG
#define DPRINTF xf86Msg
#else
#define DPRINTF while (0) xf86Msg
#endif

#define arraysize(ary)        (sizeof(ary) / sizeof(ary[0]))

/* 0xcc is SX's GXcopy equivalent */
uint32_t sx_rop[] = { 0x00, 0x88, 0x44, 0xcc, 0x22, 0xaa, 0x66, 0xee,
		      0x11, 0x99, 0x55, 0xdd, 0x33, 0xbb, 0x77, 0xff};

int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};

static void CG14Copy32(PixmapPtr, int, int, int, int, int, int);
static void CG14Copy16(PixmapPtr, int, int, int, int, int, int);
static void CG14Copy8(PixmapPtr, int, int, int, int, int, int);

static inline void
CG14Wait(Cg14Ptr p)
{
	int bail = 10000000;
	/* we wait for the busy bit to clear */
	while (((read_sx_reg(p, SX_CONTROL_STATUS) & SX_BZ) != 0) &&
	       (bail > 0)) {
		bail--;
	};
	if (bail == 0) {
		xf86Msg(X_ERROR, "SX wait for idle timed out %08x %08x\n",
		    read_sx_reg(p, SX_CONTROL_STATUS),
		    read_sx_reg(p, SX_ERROR));
	}
}

static void
CG14WaitMarker(ScreenPtr pScreen, int Marker)
{
	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);

	CG14Wait(p);
}

static Bool
CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
		int xdir, int ydir, int alu, Pixel planemask)
{
	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);

	ENTER;
	DPRINTF(X_ERROR, "%s bpp %d rop %x\n", __func__,
	    pSrcPixmap->drawable.bitsPerPixel, alu);

	if (planemask != p->last_mask) {
		CG14Wait(p);
		write_sx_reg(p, SX_PLANEMASK, planemask);
		p->last_mask = planemask;
	}
	alu = sx_rop[alu];
	if (alu != p->last_rop) {
		CG14Wait(p);
		write_sx_reg(p, SX_ROP_CONTROL, alu);
		p->last_rop = alu;
	}
	switch (pSrcPixmap->drawable.bitsPerPixel)  {
		case 8:
			p->pExa->Copy = CG14Copy8;
			break;
		case 16:
			p->pExa->Copy = CG14Copy16;
			break;
		case 32:
			p->pExa->Copy = CG14Copy32;
			break;
		default:
			DPRINTF(X_ERROR, "%s depth %d\n", __func__,
			    pSrcPixmap->drawable.bitsPerPixel);
	}
	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
	p->xdir = xdir;
	p->ydir = ydir;
	return TRUE;
}

static void
CG14Copy32(PixmapPtr pDstPixmap,
         int srcX, int srcY, int dstX, int dstY, int w, int h)
{
	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
	int dstpitch, dstoff, srcpitch, srcoff;
	int srcstart, dststart, xinc, srcinc, dstinc;
	int line, count, s, d, num;

	ENTER;
	dstpitch = exaGetPixmapPitch(pDstPixmap);
	dstoff = exaGetPixmapOffset(pDstPixmap);
	srcpitch = p->srcpitch;
	srcoff = p->srcoff;
	/*
	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
	 * actually wrote anything and only sync if it did
	 */
	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;

	/*
	 * we always copy up to 32 pixels at a time so direction doesn't
	 * matter if w<=32
	 */
	if (w > 32) {
		if (p->xdir < 0) {
			srcstart += (w - 32) << 2;
			dststart += (w - 32) << 2;
			xinc = -128;
		} else
			xinc = 128;
	} else
		xinc = 128;
	if (p->ydir < 0) {
		srcstart += (h - 1) * srcpitch;
		dststart += (h - 1) * dstpitch;
		srcinc = -srcpitch;
		dstinc = -dstpitch;
	} else {
		srcinc = srcpitch;
		dstinc = dstpitch;
	}
	if (p->last_rop == 0xcc) {
		/* plain old copy */
		if ( xinc > 0) {
			/* going left to right */
			for (line = 0; line < h; line++) {
				count = 0;
				s = srcstart;
				d = dststart;
				while ( count < w) {
					num = min(32, w - count);
					sxm(SX_LD, s, 10, num - 1);
					sxm(SX_STM, d, 10, num - 1);
					s += xinc;
					d += xinc;
					count += 32;
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		} else {
			/* going right to left */
			int i, chunks = (w >> 5);
			for (line = 0; line < h; line++) {
				s = srcstart;
				d = dststart;
				count = w;
				for (i = 0; i < chunks; i++) {
					sxm(SX_LD, s, 10, 31);
					sxm(SX_STM, d, 10, 31);
					s -= 128;
					d -= 128;
					count -= 32;
				}
				/* leftovers, if any */
				if (count > 0) {
					s += (32 - count) << 2;
					d += (32 - count) << 2;
					sxm(SX_LD, s, 10, count - 1);
					sxm(SX_STM, d, 10, count - 1);
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		}
	} else {
		/* ROPs needed */
		if ( xinc > 0) {
			/* going left to right */
			for (line = 0; line < h; line++) {
				count = 0;
				s = srcstart;
				d = dststart;
				while ( count < w) {
					num = min(32, w - count);
					sxm(SX_LD, s, 10, num - 1);
					sxm(SX_LD, d, 42, num - 1);
					if (num > 16) {
						sxi(SX_ROP, 10, 42, 74, 15);
						sxi(SX_ROP, 26, 58, 90, num - 17);
					} else {
						sxi(SX_ROP, 10, 42, 74, num - 1);
					}
					sxm(SX_STM, d, 74, num - 1);
					s += xinc;
					d += xinc;
					count += 32;
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		} else {
			/* going right to left */
			int i, chunks = (w >> 5);
			for (line = 0; line < h; line++) {
				s = srcstart;
				d = dststart;
				count = w;
				for (i = 0; i < chunks; i++) {
					sxm(SX_LD, s, 10, 31);
					sxm(SX_LD, d, 42, 31);
					sxi(SX_ROP, 10, 42, 74, 15);
					sxi(SX_ROP, 26, 58, 90, 15);
					sxm(SX_STM, d, 74, 31);
					s -= 128;
					d -= 128;
					count -= 32;
				}
				/* leftovers, if any */
				if (count > 0) {
					s += (32 - count) << 2;
					d += (32 - count) << 2;
					sxm(SX_LD, s, 10, count - 1);
					sxm(SX_LD, d, 42, count - 1);
					if (count > 16) {
						sxi(SX_ROP, 10, 42, 74, 15);
						sxi(SX_ROP, 26, 58, 90, count - 17);
					} else {
						sxi(SX_ROP, 10, 42, 74, count - 1);
					}
					sxm(SX_STM, d, 74, count - 1);
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		}
	}			
	exaMarkSync(pDstPixmap->drawable.pScreen);
}

/*
 * copy with same alignment, left to right, no ROP
 */
static void
CG14Copy8_aligned_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
    int srcpitch, int dstpitch)
{
	int saddr, daddr, pre, cnt, wrds;

	ENTER;
	
	pre = srcstart & 3;
	if (pre != 0) pre = 4 - pre;
	pre = min(pre, w);

	while (h > 0) {
		saddr = srcstart;
		daddr = dststart;
		cnt = w;
		if (pre > 0) {
			sxm(SX_LDB, saddr, 8, pre - 1);
			sxm(SX_STB, daddr, 8, pre - 1);
			saddr += pre;
			daddr += pre;
			cnt -= pre;
			if (cnt == 0) goto next;
		}
		while (cnt > 3) {
			wrds = min(32, cnt >> 2);
			sxm(SX_LD, saddr, 8, wrds - 1);
			sxm(SX_ST, daddr, 8, wrds - 1);
			saddr += wrds << 2;
			daddr += wrds << 2;
			cnt -= wrds << 2;
		}
		if (cnt > 0) {
			sxm(SX_LDB, saddr, 8, cnt - 1);
			sxm(SX_STB, daddr, 8, cnt - 1);
		}
next:
		srcstart += srcpitch;
		dststart += dstpitch;
		h--;
	}
}

/*
 * copy with same alignment, left to right, ROP
 */
static void
CG14Copy8_aligned_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
    int srcpitch, int dstpitch)
{
	int saddr, daddr, pre, cnt, wrds;

	ENTER;
	
	pre = srcstart & 3;
	if (pre != 0) pre = 4 - pre;
	pre = min(pre, w);

	while (h > 0) {
		saddr = srcstart;
		daddr = dststart;
		cnt = w;
		if (pre > 0) {
			sxm(SX_LDB, saddr, 8, pre - 1);
			sxm(SX_LDB, daddr, 40, pre - 1);
			sxi(SX_ROP, 8, 40, 72, pre - 1);
			sxm(SX_STB, daddr, 72, pre - 1);
			saddr += pre;
			daddr += pre;
			cnt -= pre;
			if (cnt == 0) goto next;
		}
		while (cnt > 3) {
			wrds = min(32, cnt >> 2);
			sxm(SX_LD, saddr, 8, wrds - 1);
			sxm(SX_LD, daddr, 40, wrds - 1);
			if (cnt > 16) {
				sxi(SX_ROP, 8, 40, 72, 15);
				sxi(SX_ROP, 8, 56, 88, wrds - 17);
			} else
				sxi(SX_ROP, 8, 40, 72, wrds - 1);
			sxm(SX_ST, daddr, 72, wrds - 1);
			saddr += wrds << 2;
			daddr += wrds << 2;
			cnt -= wrds << 2;
		}
		if (cnt > 0) {
			sxm(SX_LDB, saddr, 8, cnt - 1);
			sxm(SX_LDB, daddr, 40, cnt - 1);
			sxi(SX_ROP, 8, 40, 72, cnt - 1);
			sxm(SX_STB, daddr, 72, cnt - 1);
		}
next:
		srcstart += srcpitch;
		dststart += dstpitch;
		h--;
	}
}

/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
static void
CG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
{
	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
	int ssreg;
#ifdef DEBUG
	int taddr = 4 + dstpitch * 50;
#endif
	uint32_t lmask, rmask;
	ENTER;
	
	pre = dststart & 3;
	lmask = 0xffffffff >> pre;
	spre = srcstart & 3;
	/*
	 * make sure we count all the words needed to cover the destination 
	 * line, covering potential partials on both ends
	 */
	wrds = (w + pre + 3) >> 2;
	swrds = (w + spre + 3) >> 2;

	if (spre < pre) {
		dist = 32 - (pre - spre) * 8;
		sreg = 9;
	} else {
		dist = (spre - pre) * 8;
		sreg = 8;
	}

	/*
	 * mask out trailing pixels to avoid partial writes
	 */
	post = (dststart + w) & 3;
	if (post != 0) {
		rmask = ~(0xffffffff >> (post * 8));
		write_sx_reg(p, SX_QUEUED(7), rmask);	
		write_sx_reg(p, SX_QUEUED(6), ~rmask);	
	}

	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);

	/* mask out the leading pixels in dst by using a mask and ROP */
	if (pre != 0) {
		CG14Wait(p);
		write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
		write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);	
	}

	saddr = srcstart & ~3;
	daddr = dststart & ~3;

	while (h > 0) {
		sxm(SX_LD, daddr, 80, wrds - 1);
		sxm(SX_LD, saddr, sreg, swrds - 1);
		if (wrds > 15) {
			if (dist != 0) {
				sxi(SX_FUNNEL_I, 8, dist, 40, 15);
				sxi(SX_FUNNEL_I, 24, dist, 56, wrds - 16);
				/* shifted source pixels are now at register 40+ */
				ssreg = 40;
			} else ssreg = 8;
			if (pre != 0) {
				/* mask out leading junk */
				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
				sxi(SX_ROPB, ssreg, 80, 8, 0);
				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
				sxi(SX_ROPB, ssreg + 1, 81, 9, 14);	
			} else {
				sxi(SX_ROPB, ssreg, 80, 8, 15);
			}
			sxi(SX_ROPB, ssreg + 16, 96, 24, wrds - 16);
		} else {
			if (dist != 0) {
				sxi(SX_FUNNEL_I, 8, dist, 40, wrds);
				ssreg = 40;
			} else ssreg = 8;
			if (pre != 0) {
				/* mask out leading junk */
				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
				sxi(SX_ROPB, ssreg, 80, 8, 0);
				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
				sxi(SX_ROPB, ssreg + 1, 81, 9, wrds);
			} else {
				sxi(SX_ROPB, ssreg, 80, 8, wrds);
			}
		}
		if (post != 0) {
			/*
			 * if the last word to be written out is a partial we 
			 * mask out the leftovers and replace them with
			 * background pixels
			 * we could pull the same ROP * mask trick as we do on
			 * the left end but it's less annoying this way and
			 * the instruction count is the same
			 */
			sxi(SX_ANDS, 7 + wrds, 7, 5, 0);
			sxi(SX_ANDS, 79 + wrds, 6, 4, 0);
			sxi(SX_ORS, 5, 4, 7 + wrds, 0);
		}
#ifdef DEBUG
		sxm(SX_ST, taddr, 40, wrds - 1);
		taddr += dstpitch;
#endif
		sxm(SX_ST, daddr, 8, wrds - 1);
		saddr += srcpitch;
		daddr += dstpitch;
		h--;
	}
}

/* up to 124 pixels so direction doesn't matter, unaligned, straight copy */
static void
CG14Copy8_short_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
    int srcpitch, int dstpitch)
{
	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
	int ssreg;
#ifdef DEBUG
	int taddr = 4 + dstpitch * 50;
#endif
	uint32_t lmask, rmask;
	ENTER;
	
	pre = dststart & 3;
	lmask = 0xffffffff >> pre;
	spre = srcstart & 3;
	/*
	 * make sure we count all the words needed to cover the destination 
	 * line, covering potential partials on both ends
	 */
	wrds = (w + pre + 3) >> 2;
	swrds = (w + spre + 3) >> 2;

	if (spre < pre) {
		dist = 32 - (pre - spre) * 8;
		sreg = 9;
	} else {
		dist = (spre - pre) * 8;
		sreg = 8;
	}

	/*
	 * mask out trailing pixels to avoid partial writes
	 */
	post = (dststart + w) & 3;
	if (post != 0) {
		rmask = ~(0xffffffff >> (post * 8));
		write_sx_reg(p, SX_QUEUED(7), rmask);	
		write_sx_reg(p, SX_QUEUED(6), ~rmask);	
	}

	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);

	/* mask out the leading pixels in dst by using a mask and ROP */
	if (pre != 0) {
		CG14Wait(p);
		write_sx_reg(p, SX_ROP_CONTROL, 0xca);
		write_sx_reg(p, SX_QUEUED(R_MASK), lmask);	
	}

	saddr = srcstart & ~3;
	daddr = dststart & ~3;
	
	while (h > 0) {
		sxm(SX_LD, saddr, sreg, swrds - 1);
		if (wrds > 15) {
			if (dist != 0) {
				sxi(SX_FUNNEL_I, 8, dist, 40, 15);
				sxi(SX_FUNNEL_I, 24, dist, 56, wrds - 16);
				/* shifted source pixels are now at reg 40+ */
				ssreg = 40;
			} else ssreg = 8;
			if (pre != 0) {
				/* read only the first word */
				sxm(SX_LD, daddr, 80, 0);
				/* mask out leading junk */
				sxi(SX_ROPB, ssreg, 80, ssreg, 0);
			}
		} else {
			if (dist != 0) {
				sxi(SX_FUNNEL_I, 8, dist, 40, wrds);
				ssreg = 40;
			} else ssreg = 8;
			if (pre != 0) {
				/* read only the first word */
				sxm(SX_LD, daddr, 80, 0);
				/* mask out leading junk */
				sxi(SX_ROPB, ssreg, 80, ssreg, 0);
			}
		}
		if (post != 0) {
			int laddr = daddr + ((wrds - 1) << 2);
			/*
			 * if the last word to be written out is a partial we 
			 * mask out the leftovers and replace them with
			 * background pixels
			 * we could pull the same ROP * mask trick as we do on
			 * the left end but it's less annoying this way and
			 * the instruction count is the same
			 */
			sxm(SX_LD, laddr, 81, 0);
			sxi(SX_ANDS, ssreg + wrds - 1, 7, 5, 0);
			sxi(SX_ANDS, 81, 6, 4, 0);
			sxi(SX_ORS, 5, 4, ssreg + wrds - 1, 0);
		}
#ifdef DEBUG
		sxm(SX_ST, taddr, 40, wrds - 1);
		taddr += dstpitch;
#endif
		sxm(SX_ST, daddr, ssreg, wrds - 1);
		saddr += srcpitch;
		daddr += dstpitch;
		h--;
	}
}

static void
CG14Copy16(PixmapPtr pDstPixmap,
         int srcX, int srcY, int dstX, int dstY, int w, int h)
{
	CG14Copy8(pDstPixmap, srcX << 1, srcY, dstX << 1, dstY, w << 1, h);
}

static void
CG14Copy8(PixmapPtr pDstPixmap,
         int srcX, int srcY, int dstX, int dstY, int w, int h)
{
	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
	int dstpitch, dstoff, srcpitch, srcoff;
	int srcstart, dststart, xinc, srcinc, dstinc;
	int line, count, s, d, num;

	ENTER;
	dstpitch = exaGetPixmapPitch(pDstPixmap);
	dstoff = exaGetPixmapOffset(pDstPixmap);
	srcpitch = p->srcpitch;
	srcoff = p->srcoff;
	/*
	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
	 * actually wrote anything and only sync if it did
	 */
	srcstart = srcX + (srcpitch * srcY) + srcoff;
	dststart = dstX + (dstpitch * dstY) + dstoff;

	if (p->ydir < 0) {
		srcstart += (h - 1) * srcpitch;
		dststart += (h - 1) * dstpitch;
		srcinc = -srcpitch;
		dstinc = -dstpitch;
	} else {
		srcinc = srcpitch;
		dstinc = dstpitch;
	}

	/*
	 * this copies up to 124 pixels wide in one go, so horizontal
	 * direction / overlap don't matter
	 * uses all 32bit accesses and funnel shifter for unaligned copies
	 */
	if ((w < 125) && (w > 8)) {
		switch (p->last_rop) {
			case 0xcc:
				CG14Copy8_short_norop(p,
				    srcstart, dststart, w, h, srcinc, dstinc);
				break;
			default:
				CG14Copy8_short_rop(p,
				    srcstart, dststart, w, h, srcinc, dstinc);
		}
		return;
	}

	/*
	 * only invert x direction if absolutely necessary, it's a pain to
	 * go backwards on SX so avoid as much as possible
	 */
	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
		xinc = -32;
	} else
		xinc = 32;

	/*
	 * for aligned copies we can go all 32bit and avoid VRAM reads in the
	 * most common case
	 */
	if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
		switch (p->last_rop) {
			case 0xcc:
				CG14Copy8_aligned_norop(p,
				    srcstart, dststart, w, h, srcinc, dstinc);
				break;
			default:
				CG14Copy8_aligned_rop(p,
				    srcstart, dststart, w, h, srcinc, dstinc);
		}
		return;
	}

	/*
	 * if we make it here we either have something large and unaligned,
	 * something we need to do right to left, or something tiny.
	 * we handle the non-tiny cases by breaking them down into chunks that
	 * Copy8_short_*() can handle, making sure the destinations are 32bit 
	 * aligned whenever possible
	 * since we copy by block, not by line we need to go backwards even if
	 * we don't copy within the same line
	 */
	if (w > 8) {
		int next, wi, end = dststart + w;
		DPRINTF(X_ERROR, "%s %08x %08x %d\n",
		    __func__, srcstart, dststart, w);
		if ((p->xdir < 0) && (srcoff == dstoff)) {		
			srcstart += w;
			next = max((end - 120) & ~3, dststart);
			wi = end - next;
			srcstart -= wi;
			while (wi > 0) {
				DPRINTF(X_ERROR, "%s RL %08x %08x %d\n",
				    __func__, srcstart, next, wi);
				if (p->last_rop == 0xcc) {
					CG14Copy8_short_norop(p, srcstart,
					    next, wi, h, srcinc, dstinc);
				} else
					CG14Copy8_short_rop(p, srcstart,
					    next, wi, h, srcinc, dstinc);
				end = next;
				/*
				 * avoid extremely narrow copies so I don't
				 * have to deal with dangling start and end
				 * pixels in the same word
				 */
				if ((end - dststart) < 140) {
					next = max((end - 80) & ~3, dststart);
				} else {
					next = max((end - 120) & ~3, dststart);
				}
				wi = end - next;
				srcstart -= wi;
			}
		} else {
			next = min(end, (dststart + 124) & ~3);
			wi = next - dststart;
			while (wi > 0) {
				DPRINTF(X_ERROR, "%s LR %08x %08x %d\n",
				    __func__, srcstart, next, wi);
				if (p->last_rop == 0xcc) {
					CG14Copy8_short_norop(p, 
					    srcstart, dststart, wi, h,
					    srcinc, dstinc);
				} else
					CG14Copy8_short_rop(p,
					    srcstart, dststart, wi, h,
					    srcinc, dstinc);
				srcstart += wi;
				dststart = next;
				if ((end - dststart) < 140) {
					next = min(end, (dststart + 84) & ~3);
				} else {
					next = min(end, (dststart + 124) & ~3);
				}
				wi = next - dststart;
			}
		}
		return;
	}
	if (xinc < 0) {
		srcstart += (w - 32);
		dststart += (w - 32);
	}

	DPRINTF(X_ERROR, "%s fallback to byte-wise %d %d\n", __func__, w, h);
	if (p->last_rop == 0xcc) {
		/* plain old copy */
		if ( xinc > 0) {
			/* going left to right */
			for (line = 0; line < h; line++) {
				count = 0;
				s = srcstart;
				d = dststart;
				while ( count < w) {
					num = min(32, w - count);
					sxm(SX_LDB, s, 10, num - 1);
					sxm(SX_STBM, d, 10, num - 1);
					s += xinc;
					d += xinc;
					count += 32;
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		} else {
			/* going right to left */
			int i, chunks = (w >> 5);
			for (line = 0; line < h; line++) {
				s = srcstart;
				d = dststart;
				count = w;
				for (i = 0; i < chunks; i++) {
					sxm(SX_LDB, s, 10, 31);
					sxm(SX_STBM, d, 10, 31);
					s -= 32;
					d -= 32;
					count -= 32;
				}
				/* leftovers, if any */
				if (count > 0) {
					s += (32 - count);
					d += (32 - count);
					sxm(SX_LDB, s, 10, count - 1);
					sxm(SX_STBM, d, 10, count - 1);
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		}
	} else {
		/* ROPs needed */
		if ( xinc > 0) {
			/* going left to right */
			for (line = 0; line < h; line++) {
				count = 0;
				s = srcstart;
				d = dststart;
				while ( count < w) {
					num = min(32, w - count);
					sxm(SX_LDB, s, 10, num - 1);
					sxm(SX_LDB, d, 42, num - 1);
					if (num > 16) {
						sxi(SX_ROP, 10, 42, 74, 15);
						sxi(SX_ROP, 26, 58, 90, num - 17);
					} else {
						sxi(SX_ROP, 10, 42, 74, num - 1);
					}
					sxm(SX_STBM, d, 74, num - 1);
					s += xinc;
					d += xinc;
					count += 32;
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		} else {
			/* going right to left */
			int i, chunks = (w >> 5);
			for (line = 0; line < h; line++) {
				s = srcstart;
				d = dststart;
				count = w;
				for (i = 0; i < chunks; i++) {
					sxm(SX_LDB, s, 10, 31);
					sxm(SX_LDB, d, 42, 31);
					sxi(SX_ROP, 10, 42, 74, 15);
					sxi(SX_ROP, 26, 58, 90, 15);
					sxm(SX_STBM, d, 74, 31);
					s -= 128;
					d -= 128;
					count -= 32;
				}
				/* leftovers, if any */
				if (count > 0) {
					s += (32 - count);
					d += (32 - count);
					sxm(SX_LDB, s, 10, count - 1);
					sxm(SX_LDB, d, 42, count - 1);
					if (count > 16) {
						sxi(SX_ROP, 10, 42, 74, 15);
						sxi(SX_ROP, 26, 58, 90, count - 17);
					} else {
						sxi(SX_ROP, 10, 42, 74, count - 1);
					}
					sxm(SX_STBM, d, 74, count - 1);
				}
				srcstart += srcinc;
				dststart += dstinc;
			}
		}
	}			
	exaMarkSync(pDstPixmap->drawable.pScreen);
}

static void
CG14DoneCopy(PixmapPtr pDstPixmap)
{
}

static Bool
CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
{
	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);

	ENTER;
	DPRINTF(X_ERROR, "bits per pixel: %d %08lx\n",
	    pPixmap->drawable.bitsPerPixel, fg);

	/*
	 * GXset and GXclear are really just specual cases of GXcopy with
	 * fixed fill colour
	 */
	switch (alu) {
		case GXclear:
			alu = GXcopy;
			fg = 0;
			break;
		case GXset:
			alu = GXcopy;
			fg = 0xffffffff;
			break;
	}
	/* repeat the colour in every sub byte if we're in 8 or 16 bit */
	if (pPixmap->drawable.bitsPerPixel == 8) {
		fg |= fg << 8;
		fg |= fg << 16;
	} else if (pPixmap->drawable.bitsPerPixel == 16) {
		fg |= fg << 16;
	}
	write_sx_reg(p, SX_QUEUED(8), fg);
	write_sx_reg(p, SX_QUEUED(9), fg);
	if (planemask != p->last_mask) {
		CG14Wait(p);
		write_sx_reg(p, SX_PLANEMASK, planemask);
		p->last_mask = planemask;
	}
	alu = sx_rop[alu];
	if (alu != p->last_rop) {
		CG14Wait(p);
		write_sx_reg(p, SX_ROP_CONTROL, alu);
		p->last_rop = alu;
	}

	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
	return TRUE;
}

static void
CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
{
	int line, x, num;
	uint32_t ptr;

	ENTER;
	if (p->last_rop == 0xcc) {
		/* simple fill */
		for (line = 0; line < h; line++) {
			x = 0;
			while (x < w) {
				ptr = start + (x << 2);
				num = min(32, w - x);
				sxm(SX_STS, ptr, 8, num - 1);
				x += 32;
			}
			start += pitch;
		}
	} else if (p->last_rop == 0xaa) {
		/* nothing to do here */
		return;
	} else {
		/* alright, let's do actual ROP stuff */

		/* first repeat the fill colour into 16 registers */
		sxi(SX_SELECT_S, 8, 8, 10, 15);

		for (line = 0; line < h; line++) {
			x = 0;
			while (x < w) {
				ptr = start + (x << 2);
				num = min(32, w - x);
				/* now suck fb data into registers */
				sxm(SX_LD, ptr, 42, num - 1);
				/*
				 * ROP them with the fill data we left in 10
				 * non-memory ops can only have counts up to 16
				 */
				if (num <= 16) {
					sxi(SX_ROP, 10, 42, 74, num - 1);
				} else {
					sxi(SX_ROP, 10, 42, 74, 15);
					sxi(SX_ROP, 10, 58, 90, num - 17);
				}
				/* and write the result back into memory */
				sxm(SX_ST, ptr, 74, num - 1);
				x += 32;
			}
			start += pitch;
		}
	}
}

static void
CG14Solid16(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
{
	int line, num, pre, cnt;
	uint32_t ptr;

	ENTER;
	pre = start & 2;
	if (pre != 0) pre = 1;

	if (p->last_rop == 0xcc) {
		/* simple fill */
		for (line = 0; line < h; line++) {
			ptr = start;
			cnt = w;
			if (pre) {
				sxm(SX_STW, ptr, 8, 0);
				ptr += 2;
				cnt -= 1;
				if (cnt == 0) goto next;
			}
			/* now do the aligned pixels in 32bit chunks */
			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
			while(cnt > 1) {
				num = min(32, cnt >> 1);
				sxm(SX_STS, ptr, 8, num - 1);
				ptr += num << 2;
				cnt -= num << 1;
			}
			if (cnt > 1) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
			if (cnt > 0) {
				sxm(SX_STW, ptr, 8, 0);
			}
next:
			start += pitch;
		}
	} else if (p->last_rop == 0xaa) {
		/* nothing to do here */
		return;
	} else {
		/* alright, let's do actual ROP stuff */

		/* first repeat the fill colour into 16 registers */
		sxi(SX_SELECT_S, 8, 8, 10, 15);

		for (line = 0; line < h; line++) {
			ptr = start;
			cnt = w;
			pre = min(pre, cnt);
			if (pre) {
				sxm(SX_LDW, ptr, 26, 0);
				sxi(SX_ROP, 10, 26, 42, 0);
				sxm(SX_STW, ptr, 42, 0);
				ptr += 2;
				cnt -= 1;
				if (cnt == 0) goto next2;
			}
			/* now do the aligned pixels in 32bit chunks */
			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
			while(cnt > 1) {
				num = min(32, cnt >> 1);
				sxm(SX_LD, ptr, 26, num - 1);
				if (num <= 16) {
					sxi(SX_ROP, 10, 26, 58, num - 1);
				} else {
					sxi(SX_ROP, 10, 26, 58, 15);
					sxi(SX_ROP, 10, 42, 74, num - 17);
				}
				sxm(SX_ST, ptr, 58, num - 1);
				ptr += num << 2;
				cnt -= num << 1;
			}
			if (cnt > 1) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
			if (cnt > 0) {
				sxm(SX_LDW, ptr, 26, 0);
				sxi(SX_ROP, 10, 26, 42, 0);
				sxm(SX_STW, ptr, 42, 0);
			}
next2:
			start += pitch;
		}
	}
}

static void
CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
{
	int line, num, pre, cnt;
	uint32_t ptr;

	ENTER;
	pre = start & 3;
	if (pre != 0) pre = 4 - pre;

	if (p->last_rop == 0xcc) {
		/* simple fill */
		for (line = 0; line < h; line++) {
			ptr = start;
			cnt = w;
			pre = min(pre, cnt);
			if (pre) {
				sxm(SX_STBS, ptr, 8, pre - 1);
				ptr += pre;
				cnt -= pre;
				if (cnt == 0) goto next;
			}
			/* now do the aligned pixels in 32bit chunks */
			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
			while(cnt > 3) {
				num = min(32, cnt >> 2);
				sxm(SX_STS, ptr, 8, num - 1);
				ptr += num << 2;
				cnt -= num << 2;
			}
			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
			if (cnt > 0) {
				sxm(SX_STBS, ptr, 8, cnt - 1);
			}
			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
next:
			start += pitch;
		}
	} else if (p->last_rop == 0xaa) {
		/* nothing to do here */
		return;
	} else {
		/* alright, let's do actual ROP stuff */

		/* first repeat the fill colour into 16 registers */
		sxi(SX_SELECT_S, 8, 8, 10, 15);

		for (line = 0; line < h; line++) {
			ptr = start;
			cnt = w;
			pre = min(pre, cnt);
			if (pre) {
				sxm(SX_LDB, ptr, 26, pre - 1);
				sxi(SX_ROP, 10, 26, 42, pre - 1);
				sxm(SX_STB, ptr, 42, pre - 1);
				ptr += pre;
				cnt -= pre;
				if (cnt == 0) goto next2;
			}
			/* now do the aligned pixels in 32bit chunks */
			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
			while(cnt > 3) {
				num = min(32, cnt >> 2);
				sxm(SX_LD, ptr, 26, num - 1);
				if (num <= 16) {
					sxi(SX_ROP, 10, 26, 58, num - 1);
				} else {
					sxi(SX_ROP, 10, 26, 58, 15);
					sxi(SX_ROP, 10, 42, 74, num - 17);
				}
				sxm(SX_ST, ptr, 58, num - 1);
				ptr += num << 2;
				cnt -= num << 2;
			}
			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
			if (cnt > 0) {
				sxm(SX_LDB, ptr, 26, cnt - 1);
				sxi(SX_ROP, 10, 26, 42, cnt - 1);
				sxm(SX_STB, ptr, 42, cnt - 1);
			}
			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
next2:
			start += pitch;
		}
	}
}

static void
CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
{
	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
	int start, depth;

	ENTER;
	dstpitch = exaGetPixmapPitch(pPixmap);
	dstoff = exaGetPixmapOffset(pPixmap);

	depth = pPixmap->drawable.bitsPerPixel;
	switch (depth) {
		case 32:
			start = dstoff + (y1 * dstpitch) + (x1 << 2);
			CG14Solid32(p, start, dstpitch, w, h);
			break;
		case 16:
			start = dstoff + (y1 * dstpitch) + (x1 << 1);
			CG14Solid16(p, start, dstpitch, w, h);
			break;
		case 8:
			start = dstoff + (y1 * dstpitch) + x1;
			CG14Solid8(p, start, dstpitch, w, h);
			break;
	}

	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
	    dstpitch, dstoff, start);
	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
	exaMarkSync(pPixmap->drawable.pScreen);
}

/*
 * Memcpy-based UTS.
 */
static Bool
CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
    char *src, int src_pitch)
{
	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
	int    dst_pitch  = exaGetPixmapPitch(pDst);

	int bpp    = pDst->drawable.bitsPerPixel;
	int cpp    = (bpp + 7) >> 3;
	int wBytes = w * cpp;

	ENTER;
	DPRINTF(X_ERROR, "%s depth %d\n", __func__, bpp);
	dst += (x * cpp) + (y * dst_pitch);

	CG14Wait(p);

	while (h--) {
		memcpy(dst, src, wBytes);
		src += src_pitch;
		dst += dst_pitch;
	}
	__asm("stbar;");
	return TRUE;
}

/*
 * Memcpy-based DFS.
 */
static Bool
CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
    char *dst, int dst_pitch)
{
	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
	int    src_pitch  = exaGetPixmapPitch(pSrc);

	ENTER;
	int bpp    = pSrc->drawable.bitsPerPixel;
	int cpp    = (bpp + 7) >> 3;
	int wBytes = w * cpp;

	src += (x * cpp) + (y * src_pitch);

	CG14Wait(p);

	while (h--) {
		memcpy(dst, src, wBytes);
		src += src_pitch;
		dst += dst_pitch;
	}

	return TRUE;
}

Bool
CG14CheckComposite(int op, PicturePtr pSrcPicture,
                           PicturePtr pMaskPicture,
                           PicturePtr pDstPicture)
{
	int i, ok = FALSE;

	ENTER;

	/*
	 * SX is in theory capable of accelerating pretty much all Xrender ops,
	 * even coordinate transformation and gradients. Support will be added
	 * over time and likely have to spill over into its own source file.
	 */
	
	if ((op != PictOpOver) && (op != PictOpAdd)/* && (op != PictOpSrc)*/) {
		DPRINTF(X_ERROR, "%s: rejecting %d\n", __func__, op);
		return FALSE;
	}

	if (pSrcPicture != NULL) {
		i = 0;
		while ((i < arraysize(src_formats)) && (!ok)) {
			ok =  (pSrcPicture->format == src_formats[i]);
			i++;
		}

		if (!ok) {
			DPRINTF(X_ERROR, "%s: unsupported src format %x\n",
			    __func__, pSrcPicture->format);
			return FALSE;
		}
		DPRINTF(X_ERROR, "src is %x, %d\n", pSrcPicture->format, op);
	}

	if (pDstPicture != NULL) {
		i = 0;
		ok = FALSE;
		while ((i < arraysize(src_formats)) && (!ok)) {
			ok =  (pDstPicture->format == src_formats[i]);
			i++;
		}

		if (!ok) {
			DPRINTF(X_ERROR, "%s: unsupported dst format %x\n",
			    __func__, pDstPicture->format);
			return FALSE;
		}
		DPRINTF(X_ERROR, "dst is %x, %d\n", pDstPicture->format, op);
	}

	if (pMaskPicture != NULL) {
		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
		    pMaskPicture->pDrawable->width,
		    pMaskPicture->pDrawable->height);
	}
	return TRUE;
}

Bool
CG14PrepareComposite(int op, PicturePtr pSrcPicture,
                             PicturePtr pMaskPicture,
                             PicturePtr pDstPicture,
                             PixmapPtr  pSrc,
                             PixmapPtr  pMask,
                             PixmapPtr  pDst)
{
	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);

	ENTER;

	p->no_source_pixmap = FALSE;
	p->source_is_solid = FALSE;

	if (pSrcPicture->format == PICT_a1) {
		DPRINTF(X_ERROR, "src mono, dst %x, op %d\n",
		    pDstPicture->format, op);
		if (pMaskPicture != NULL) {
			DPRINTF(X_ERROR, "msk %x\n", pMaskPicture->format);
		}
	}
	if (pSrcPicture->pSourcePict != NULL) {
		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
			p->fillcolour =
			    pSrcPicture->pSourcePict->solidFill.color;
			DPRINTF(X_ERROR, "%s: solid src %08x\n",
			    __func__, p->fillcolour);
			p->no_source_pixmap = TRUE;
			p->source_is_solid = TRUE;
		}
	}
	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
		if (pMaskPicture->pSourcePict->type ==
		    SourcePictTypeSolidFill) {
			p->fillcolour = 
			   pMaskPicture->pSourcePict->solidFill.color;
			DPRINTF(X_ERROR, "%s: solid mask %08x\n",
			    __func__, p->fillcolour);
		}
	}
	if (pMaskPicture != NULL) {
		p->mskoff = exaGetPixmapOffset(pMask);
		p->mskpitch = exaGetPixmapPitch(pMask);
		p->mskformat = pMaskPicture->format;
	} else {
		p->mskoff = 0;
		p->mskpitch = 0;
		p->mskformat = 0;
	}
	if (pSrc != NULL) {
		p->source_is_solid = 
		   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
		p->srcoff = exaGetPixmapOffset(pSrc);
		p->srcpitch = exaGetPixmapPitch(pSrc);
		if (p->source_is_solid) {
			p->fillcolour = *(uint32_t *)(p->fb + p->srcoff);
		}
	}
	p->srcformat = pSrcPicture->format;
	p->dstformat = pDstPicture->format;
	
	if (p->source_is_solid) {
		uint32_t temp;

		/* stuff source colour into SX registers, swap as needed */
		temp = p->fillcolour;
		DPRINTF(X_ERROR, "solid %08x\n", temp);
		switch (p->srcformat) {
			case PICT_a8r8g8b8:
			case PICT_x8r8g8b8:
				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
				temp = temp >> 8;
				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
				temp = temp >> 8;
				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
				break;
			case PICT_a8b8g8r8:
			case PICT_x8b8g8r8:
				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
				temp = temp >> 8;
				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
				temp = temp >> 8;
				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
				break;
		}
		write_sx_reg(p, SX_QUEUED(8), 0xff);
	}
	p->op = op;
	if (op == PictOpSrc) {
		if (pSrc == NULL) {
			DPRINTF(X_ERROR, "src type %d\n", pSrcPicture->pSourcePict->type);
			return FALSE;
		}
		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
	}
#ifdef SX_DEBUG
	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
	    *(uint32_t *)(p->fb + p->srcoff));	
#endif
	return TRUE;
}

void
CG14Composite(PixmapPtr pDst, int srcX, int srcY,
                              int maskX, int maskY,
                              int dstX, int dstY,
                              int width, int height)
{
	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
	uint32_t dstoff, dstpitch;
	uint32_t dst, msk, src;
	int flip = 0;

	ENTER;
	dstoff = exaGetPixmapOffset(pDst);		
	dstpitch = exaGetPixmapPitch(pDst);

	flip = (PICT_FORMAT_TYPE(p->srcformat) !=
		PICT_FORMAT_TYPE(p->dstformat));

	switch (p->op) {
		case PictOpOver:
			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
			    p->mskformat, p->dstformat, srcX, srcY);
			if (p->source_is_solid) {
				switch (p->mskformat) {
					case PICT_a8:
						msk = p->mskoff + 
						    (maskY * p->mskpitch) +
						    maskX;
						CG14Comp_Over8Solid(p,
						    msk, p->mskpitch,
						    dst, dstpitch,
						    width, height);
						break;
					case PICT_a8r8g8b8:
					case PICT_a8b8g8r8:
						msk = p->mskoff + 
						    (maskY * p->mskpitch) + 
						    (maskX << 2);
						CG14Comp_Over32Solid(p,
						    msk, p->mskpitch,
						    dst, dstpitch,
						    width, height);
						break;
					case 0:
						DPRINTF(X_ERROR, "%s: Over with solid %08x and no mask\n", __func__, p->fillcolour);
						CG14PrepareSolid(pDst, GXcopy, 0xffffffff, p->fillcolour);
						CG14Solid(pDst, dstX, dstY, width, height);
						break;
					default:
						xf86Msg(X_ERROR,
						  "unsupported mask format %08x\n", p->mskformat);
				}
			} else {
				DPRINTF(X_ERROR, "non-solid over with msk %x\n",
				    p->mskformat);
				switch (p->srcformat) {
					case PICT_a8r8g8b8:
					case PICT_a8b8g8r8:
						src = p->srcoff +
						    (srcY * p->srcpitch) +
						    (srcX << 2);
						dst = dstoff +
						    (dstY * dstpitch) +
						    (dstX << 2);
						if (p->mskformat == PICT_a8) {
							msk = p->mskoff + 
							    (maskY * p->mskpitch) +
							    maskX;	
							CG14Comp_Over32Mask(p, 
							    src, p->srcpitch,
							    msk, p->mskpitch,
							    dst, dstpitch,
							    width, height, flip);
						} else {
							CG14Comp_Over32(p, 
							    src, p->srcpitch,
							    dst, dstpitch,
							    width, height, flip);
						}
						break;
					case PICT_x8r8g8b8:
					case PICT_x8b8g8r8:
						src = p->srcoff +
						    (srcY * p->srcpitch) +
						    (srcX << 2);
						dst = dstoff +
						    (dstY * dstpitch) +
						    (dstX << 2);
						if (p->mskformat == PICT_a8) {
							msk = p->mskoff + 
							    (maskY * p->mskpitch) +
							    maskX;	
							CG14Comp_Over32Mask_noalpha(p, 
							    src, p->srcpitch,
							    msk, p->mskpitch,
							    dst, dstpitch,
							    width, height, flip);
						} else if ((p->mskformat == PICT_a8r8g8b8) ||
							   (p->mskformat == PICT_a8b8g8r8)) {
							msk = p->mskoff + 
							    (maskY * p->mskpitch) +
							    (maskX << 2);	
							CG14Comp_Over32Mask32_noalpha(p, 
							    src, p->srcpitch,
							    msk, p->mskpitch,
							    dst, dstpitch,
							    width, height, flip);
						} else {
							xf86Msg(X_ERROR, "no src alpha, mask is %x\n", p->mskformat);
						}
						break;
					default:
						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
						    __func__, p->srcformat);
				}
			}
			break;
		case PictOpAdd:
			DPRINTF(X_ERROR, "Add %08x %08x\n",
			    p->srcformat, p->dstformat);
			switch (p->srcformat) {
				case PICT_a8:
					src = p->srcoff +
					    (srcY * p->srcpitch) + srcX;
					if (p->dstformat == PICT_a8) {
						dst = dstoff + 
						      (dstY * dstpitch) + dstX;
						CG14Comp_Add8(p,
						    src, p->srcpitch,
						    dst, dstpitch,
						    width, height);
					} else {
						dst = dstoff + 
						      (dstY * dstpitch) +
						      (dstX << 2);
						CG14Comp_Add8_32(p,
						    src, p->srcpitch,
						    dst, dstpitch,
						    width, height);
					}
					break;
				case PICT_a8r8g8b8:
				case PICT_x8r8g8b8:
					src = p->srcoff +
					    (srcY * p->srcpitch) + (srcX << 2);
					dst = dstoff + (dstY * dstpitch) +
					    (dstX << 2);
					CG14Comp_Add32(p, src, p->srcpitch,
					    dst, dstpitch, width, height);
					break;
				default:
					xf86Msg(X_ERROR,
					    "unsupported src format\n");
			}
			break;
		case PictOpSrc:
			DPRINTF(X_ERROR, "Src %08x %08x\n",
			    p->srcformat, p->dstformat);
			if (p->mskformat != 0)
				xf86Msg(X_ERROR, "Src mask %08x\n", p->mskformat);
			if (p->srcformat == PICT_a8) {
				CG14Copy8(pDst, srcX, srcY, dstX, dstY, width, height);
			} else {
				/* convert between RGB and BGR? */
				CG14Copy32(pDst, srcX, srcY, dstX, dstY, width, height);
			}
			break;
		default:
			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
	}
	exaMarkSync(pDst->drawable.pScreen);
}



Bool
CG14InitAccel(ScreenPtr pScreen)
{
	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
	ExaDriverPtr pExa;

	pExa = exaDriverAlloc();
	if (!pExa)
		return FALSE;

	p->pExa = pExa;

	pExa->exa_major = EXA_VERSION_MAJOR;
	pExa->exa_minor = EXA_VERSION_MINOR;

	pExa->memoryBase = p->fb;
	pExa->memorySize = p->memsize;
	pExa->offScreenBase = p->width * p->height * (pScrn->bitsPerPixel >> 3);

	/*
	 * SX memory instructions are written to 64bit aligned addresses with
	 * a 3 bit displacement. Make sure the displacement remains constant
	 * within one column
	 */
	
	pExa->pixmapOffsetAlign = 8;
	pExa->pixmapPitchAlign = 8;

	pExa->flags = EXA_OFFSCREEN_PIXMAPS
		      | EXA_SUPPORTS_OFFSCREEN_OVERLAPS
		      /*| EXA_MIXED_PIXMAPS*/;

	/*
	 * these limits are bogus
	 * SX doesn't deal with coordinates at all, so there is no limit but
	 * we have to put something here
	 */
	pExa->maxX = 4096;
	pExa->maxY = 4096;

	pExa->WaitMarker = CG14WaitMarker;

	pExa->PrepareSolid = CG14PrepareSolid;
	pExa->Solid = CG14Solid;
	pExa->DoneSolid = CG14DoneCopy;
	pExa->PrepareCopy = CG14PrepareCopy;
	pExa->Copy = CG14Copy32;
	pExa->DoneCopy = CG14DoneCopy;
	if (p->use_xrender) {
		pExa->CheckComposite = CG14CheckComposite;
		pExa->PrepareComposite = CG14PrepareComposite;
		pExa->Composite = CG14Composite;
		pExa->DoneComposite = CG14DoneCopy;
	}

	/* EXA hits more optimized paths when it does not have to fallback 
	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
	 */
	pExa->UploadToScreen = CG14UploadToScreen;
	pExa->DownloadFromScreen = CG14DownloadFromScreen;

	p->queuecount = 0;
	/* do some hardware init */
	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
	p->last_mask = 0xffffffff;
	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
	p->last_rop = 0xcc;
	return exaDriverInit(pScreen, pExa);
}
