/*
 * SiS memcpy() routines (assembly)
 *
 * Copyright (C) 2004-2005 Thomas Winischhofer
 *
 * Idea and some code bits from via_memcpy.c which is
 * Copyright (C) 2004 Thomas Hellstroem, All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdlib.h>
#include "sis.h"

#if 0			/* Debug */
#define SISDGBMC
#endif

/* Offscreen framebuffer allocator, implemented outside this file.
 * SiS_AllocBuffers() below treats a zero return from SISAllocateFBMemory()
 * as failure and otherwise adds the value to pSiS->FbBase; "handle"
 * identifies the allocation and is later passed to SISFreeFBMemory().
 */
extern unsigned int SISAllocateFBMemory(ScrnInfoPtr pScrn, void **handle, int bytesize);
extern void	    SISFreeFBMemory(ScrnInfoPtr pScrn, void **handle);

/* Buffer sizes in bytes. BUFFERSIZE is also the amount of data copied by
 * every timed run of the benchmark.
 */
#define CPUBUFFERSIZE 2048       /* Size of /proc/cpuinfo buffer */
#define BUFFERSIZE (576 * 1152)  /* Matches 720x576 YUV420 */

/************************************************************************/
/*                   arch specific memcpy() routines                    */
/************************************************************************/

/* i386, AMD64 */

/* Emit an sfence instruction. The "memory" clobber additionally keeps the
 * compiler from moving memory accesses across the statement. Used as the
 * closing step of the SSE copy routines generated further below.
 */
#define FENCE 			\
     __asm__ __volatile__( 	\
		  " sfence\n" 	\
		  :		\
		  :		\
		  : "memory");

/* Emit sfence followed by emms (leave MMX state). Used as the closing step
 * of the MMX2 ("mmxext") copy routine generated further below.
 */
#define FENCEMMS 		\
     __asm__ __volatile__ (	\
		  " sfence\n"	\
		  " emms\n"	\
		  :		\
		  :		\
		  : "memory");

/* Emit femms (the 3DNow! counterpart of emms). Used both before and after
 * the copy loop of the 3DNow! routine generated further below.
 */
#define FEMMS 			\
     __asm__ __volatile__(	\
		  " femms\n"	\
		  :		\
		  :		\
		  : "memory");

/* Emit emms (leave MMX state). Used before and after the plain MMX copy
 * loop and before the MMX2 copy loop generated further below.
 */
#define EMMS 			\
     __asm__ __volatile__(	\
		  " emms\n"	\
		  :		\
		  :		\
		  : "memory");

/* Prefetch mnemonics as string literals; they are pasted in front of an
 * address operand inside the asm templates below (selected by the "ptype"
 * argument of PREFETCH_FUNC).
 */
#define SSE_PREFETCH " prefetchnta "
#define NOW_PREFETCH " prefetch "

/* Issue nine prefetch hints relative to "from" before a copy starts.
 * arch_prefetch is one of the mnemonic string literals above.
 * Offsets are 0..192 in steps of 32, then 256 and 288; offset 224 is not
 * touched (NOTE(review): possibly an omission - confirm before changing).
 */
#define PREFETCH1(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "(%0)\n"	\
		  arch_prefetch "32(%0)\n"	\
		  arch_prefetch "64(%0)\n"	\
		  arch_prefetch "96(%0)\n"	\
		  arch_prefetch "128(%0)\n"	\
		  arch_prefetch "160(%0)\n"	\
		  arch_prefetch "192(%0)\n"	\
		  arch_prefetch "256(%0)\n"	\
		  arch_prefetch "288(%0)\n"	\
		  : 				\
		  : "r" (from) );

/* Single prefetch hint at from+320; PREFETCH_FUNC issues it right before
 * the tail copy.
 */
#define PREFETCH2(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "320(%0)\n"	\
		  : 				\
		  : "r" (from) );

/* Single prefetch hint at from+288; PREFETCH_FUNC issues it right after
 * the tail copy (at which point "from" has already been advanced).
 */
#define PREFETCH3(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "288(%0)\n"	\
		  : 				\
		  : "r" (from) );

/* Copy n bytes from "from" to "to" with string instructions; used for the
 * tail (less than 64 bytes) left over by the unrolled copy loops.
 *
 * The count is split by its low bits: bit 0 -> one movsb, bit 1 -> one
 * movsw, the remaining n >> 2 dwords via rep movsl.
 *
 * "to" and "from" must be modifiable pointer lvalues; on exit they point
 * just past the copied data. "n" is only read.
 *
 * The byte count lives in ECX and is destroyed by the code, so it is tied
 * to a scratch output operand: an asm statement must not modify a register
 * that is declared as a pure input.
 */
#define small_memcpy_i386(to,from,n)					\
    {									\
	unsigned long small_cnt_;					\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrl $1, %%ecx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrl $1, %%ecx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: rep ; movsl"					\
		  : "=&D" (to), "=&S" (from), "=&c" (small_cnt_)	\
		  : "2" ((unsigned long) (n)),				\
		    "0" ((long) to), "1" ((long) from) 			\
		  : "memory", "cc");					\
	(void) small_cnt_;						\
    }

/* 64-bit variant of the tail copy: copy n bytes from "from" to "to".
 *
 * The count is split by its low bits: bit 0 -> one movsb, bit 1 -> one
 * movsw, bit 2 -> one movsl, the remaining n >> 3 quadwords via rep movsq.
 *
 * "to" and "from" must be modifiable pointer lvalues; on exit they point
 * just past the copied data. "n" is only read.
 *
 * The count is widened to unsigned long so that all 64 bits of RCX are
 * defined before the 64-bit shifts, and it is tied to a scratch output
 * operand because the code destroys RCX (an asm statement must not modify
 * a register that is declared as a pure input).
 */
#define small_memcpy_amd64(to,from,n)					\
    {									\
	unsigned long small_cnt_;					\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrq $1, %%rcx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrq $1, %%rcx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: shrq $1, %%rcx\n"					\
		  " jnc 3f\n"						\
		  " movsl\n"						\
		"3: rep ; movsq"					\
		  : "=&D" (to), "=&S" (from), "=&c" (small_cnt_)	\
		  : "2" ((unsigned long) (n)),				\
		    "0" ((long) to), "1" ((long) from) 			\
		  : "memory", "cc");					\
	(void) small_cnt_;						\
    }

/* MMX copy loop (32-bit code: uses leal/decl).
 *
 * Copies lcnt chunks of 64 bytes from "from" to "to" through mm0-mm3 with
 * ordinary movq stores. "to" and "from" are advanced by 64 per iteration;
 * "dummy" is a scratch lvalue that carries the loop counter.
 *
 * "prefetch" is a string literal pasted in front of "320(%1)" and
 * "352(%1)"; NOPREFETCH_FUNC passes "#" here, presumably so the assembler
 * treats those two lines as comments.
 *
 * lcnt must be non-zero: the counter is decremented before it is tested.
 * The generated functions guard the call with if(lcnt).
 */
#define MMX_CPY(prefetch,from,to,dummy,lcnt)				\
    __asm__ __volatile__ (						\
	        "1:\n"							\
		    prefetch "320(%1)\n"				\
	          " movq (%1), %%mm0\n"					\
		  " movq 8(%1), %%mm1\n"				\
		  " movq 16(%1), %%mm2\n"				\
		  " movq 24(%1), %%mm3\n"				\
		  " movq %%mm0, (%0)\n"					\
		  " movq %%mm1, 8(%0)\n"				\
		  " movq %%mm2, 16(%0)\n"				\
		  " movq %%mm3, 24(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movq 32(%1), %%mm0\n"				\
		  " movq 40(%1), %%mm1\n"				\
		  " movq 48(%1), %%mm2\n"				\
		  " movq 56(%1), %%mm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movq %%mm0, 32(%0)\n"				\
		  " movq %%mm1, 40(%0)\n"				\
		  " movq %%mm2, 48(%0)\n"				\
		  " movq %%mm3, 56(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt) 			\
		  : "memory", "cc");

/* SSE copy loop (32-bit code: uses leal/decl).
 *
 * Copies lcnt chunks of 64 bytes from "from" to "to" through xmm0-xmm3.
 * Loads use movups when "from" is not 16-byte aligned and movaps when it
 * is; stores always use movntps. Only the source alignment is tested
 * here - the destination is not checked (the function tables below mark
 * the SSE entry with reqAlignment = TRUE).
 *
 * "to" and "from" are advanced by 64 per iteration; "dummy" is a scratch
 * lvalue that carries the loop counter. lcnt must be non-zero (decrement
 * happens before the test).
 *
 * The expansion is an if/else statement, not a single expression.
 */
#define SSE_CPY(prefetch,from,to,dummy,lcnt)				\
    if((ULong) from & 15) {						\
	__asm__ __volatile__ (						\
		"1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movups (%1), %%xmm0\n"				\
		  " movups 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movups 32(%1), %%xmm2\n"				\
		  " movups 48(%1), %%xmm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc"); 					\
    } else {								\
	__asm__ __volatile__ (						\
		"2:\n"							\
		    prefetch "320(%1)\n"				\
		  " movaps (%1), %%xmm0\n"				\
		  " movaps 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
        	    prefetch "352(%1)\n"				\
		  " movaps 32(%1), %%xmm2\n"				\
		  " movaps 48(%1), %%xmm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 2b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc");					\
    }

/* SSE copy loop, 64-bit variant: identical to SSE_CPY except that the
 * pointers are advanced with leaq. The loop counter is still a 32-bit
 * value (decl on an int operand).
 *
 * Loads use movups when "from" is not 16-byte aligned and movaps when it
 * is; stores always use movntps and the destination alignment is not
 * checked here. lcnt must be non-zero.
 */
#define SSE64_CPY(prefetch,from,to,dummy,lcnt)				\
    if((ULong) from & 15) {						\
	__asm__ __volatile__ (						\
		"1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movups (%1), %%xmm0\n"				\
		  " movups 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movups 32(%1), %%xmm2\n"				\
		  " movups 48(%1), %%xmm3\n"				\
		  " leaq 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leaq 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc"); 					\
    } else {								\
	__asm__ __volatile__ (						\
		"2:\n"							\
		    prefetch "320(%1)\n"				\
		  " movaps (%1), %%xmm0\n"				\
		  " movaps 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
        	    prefetch "352(%1)\n"				\
		  " movaps 32(%1), %%xmm2\n"				\
		  " movaps 48(%1), %%xmm3\n"				\
		  " leaq 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leaq 64(%0),%0\n"					\
		  " jne 2b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc");					\
    }

/* MMX2 ("mmxext") copy loop (32-bit code: uses leal/decl).
 *
 * Same structure as MMX_CPY - lcnt chunks of 64 bytes through mm0-mm3 -
 * but the stores use movntq and the loop head is aligned with .p2align.
 * "to" and "from" are advanced by 64 per iteration; "dummy" carries the
 * loop counter. lcnt must be non-zero.
 */
#define MMXEXT_CPY(prefetch,from,to,dummy,lcnt)				\
    __asm__ __volatile__ (						\
		  ".p2align 4,,7\n"					\
		 "1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movq (%1), %%mm0\n"					\
		  " movq 8(%1), %%mm1\n"				\
		  " movq 16(%1), %%mm2\n"				\
		  " movq 24(%1), %%mm3\n"				\
		  " movntq %%mm0, (%0)\n"				\
		  " movntq %%mm1, 8(%0)\n"				\
		  " movntq %%mm2, 16(%0)\n"				\
		  " movntq %%mm3, 24(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movq 32(%1), %%mm0\n"				\
		  " movq 40(%1), %%mm1\n"				\
		  " movq 48(%1), %%mm2\n"				\
		  " movq 56(%1), %%mm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntq %%mm0, 32(%0)\n"				\
		  " movntq %%mm1, 40(%0)\n"				\
		  " movntq %%mm2, 48(%0)\n"				\
		  " movntq %%mm3, 56(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt) 			\
		  : "memory", "cc");


/* Generate "static void <prefix>_memcpy(UChar *to, const UChar *from,
 * int size)" for a copy loop that uses prefetch hints.
 *
 *   prefix - name prefix of the generated function
 *   itype  - selects the loop macro <itype>_CPY
 *   ptype  - selects the prefetch mnemonic <ptype>_PREFETCH
 *   begin  - statement executed before copying (may be empty)
 *   fence  - statement executed after copying
 *   small  - tail-copy macro, called as small(to, from, rest)
 *
 * The generated function copies size >> 6 chunks of 64 bytes with the
 * loop macro (skipped when that count is zero) and the remaining
 * size & 63 bytes with "small".
 */
#define PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)		\
									\
    static void prefix##_memcpy(UChar *to,				\
				const UChar *from,			\
				int size)				\
    {									\
	int lcnt = size >> 6;						\
	int rest = size & 63;						\
	register int dummy;						\
									\
	PREFETCH1(ptype##_PREFETCH,from);				\
									\
	begin;								\
	if(lcnt) {							\
	   itype##_CPY(ptype##_PREFETCH,from,to,dummy,lcnt);		\
	}								\
	if(rest) {							\
	   PREFETCH2(ptype##_PREFETCH,from);				\
	   small(to, from, rest);					\
	   PREFETCH3(ptype##_PREFETCH,from);				\
	}								\
	fence;								\
    }

/* Same as PREFETCH_FUNC but without any prefetch hints: no PREFETCH1/2/3
 * calls, and the loop macro receives "#" as its prefetch string
 * (presumably turning the two prefetch lines into assembler comments).
 *
 *   prefix - name prefix of the generated function
 *   itype  - selects the loop macro <itype>_CPY
 *   begin  - statement executed before copying
 *   fence  - statement executed after copying
 *   small  - tail-copy macro, called as small(to, from, rest)
 */
#define NOPREFETCH_FUNC(prefix,itype,begin,fence,small)			\
									\
    static void prefix##_memcpy(UChar *to,				\
				const UChar *from,			\
				int size)				\
    {									\
	int lcnt = size >> 6;						\
	int rest = size & 63;						\
	register int dummy;						\
									\
	begin;								\
	if(lcnt) {							\
	   itype##_CPY("#",from,to,dummy,lcnt);				\
	}								\
	if(rest) {							\
	   small(to, from, rest);					\
	}								\
	fence;								\
    }

/* Other archs */

/* ... */


/* Type for table for benchmark list */

/* One candidate copy routine. Tables of these are terminated by an entry
 * whose mFunc is NULL.
 */
typedef struct {
    vidCopyFunc  mFunc;		/* the copy routine; NULL ends the table */
    char         *mName;	/* name printed in the log messages */
    unsigned int mycpuflag;	/* entry is considered only if this bit is set in the CPU flags */
    int          grade;		/* static rank for copies to video RAM; lower is preferred */
    int 	 gradefrom;	/* static rank for copies from video RAM; lower is preferred */
    Bool         reqAlignment;	/* if set, never chosen as the routine for unaligned transfers */
} SISMCFuncData;

/************************************************************************/
/*                   libc memcpy() wrapper - generic                    */
/************************************************************************/

/* Adapter giving the C library memcpy() the vidCopyFunc calling
 * convention (UChar pointers, int size, no return value). This is the
 * default and fallback routine throughout this file.
 */
static void SiS_libc_memcpy(UChar *dst, const UChar *src, int size)
{
    memcpy(dst, src, size);
}

/************************************************************************/
/* We only do all that stuff under gcc; no idea what other compilers 	*/
/* would do with our asm code.  					*/
/************************************************************************/

#ifndef __GNUC__

/* Non-GCC build: no CPU feature detection is performed; reports no flags. */
unsigned int SiSGetCPUFlags(ScrnInfoPtr pScrn)
{
    return 0;
}

/* Non-GCC build: always selects the libc wrapper, both as the return value
 * and through *UMemCpy. pScreen and from are not used.
 */
vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
    *UMemCpy = SiS_libc_memcpy;
    return SiS_libc_memcpy;
}

/* Non-GCC build: the default copy routine is the libc wrapper. */
vidCopyFunc SiSVidCopyGetDefault(void)
{
    return SiS_libc_memcpy;
}

#else /* ! Everything below is gcc specific ! */

/************************************************************************/
/*                    Definitions for archs and OSes                    */
/************************************************************************/

#undef SiS_checkosforsse
#undef SiS_canBenchmark
#undef SiS_haveProc
#undef SiS_haveBuiltInMC

#if defined(__i386__) /* ***************************************** i386 */

#define SiS_checkosforsse 	/* Does this cpu support sse and do we need to check os? */
#define SiS_canBenchmark	/* Can we perform a benchmark? */
#ifdef SIS_LINUX
#define SiS_haveProc		/* Do we have /proc/cpuinfo or similar? */
#endif
#define SiS_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */

/* Built-in memcpy for i386 */
/* String-instruction memcpy for i386.
 *
 * The byte count is loaded into ECX and split by its low bits: bit 0 ->
 * one movsb, bit 1 -> one movsw, the remaining n >> 2 dwords via
 * rep movsl. d1..d3 only exist to tell the compiler that ECX, EDI and ESI
 * are modified.
 *
 * Returns "to" (the original destination pointer).
 */
static __inline void * builtin_memcpy(void * to, const void * from, size_t n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
		  " cld\n"
		  " shrl $1, %%ecx\n"
		  " jnc 1f\n"
		  " movsb\n"
		"1: shrl $1, %%ecx\n"
		  " jnc 2f\n"
		  " movsw\n"
		"2: rep ; movsl\n"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  : "0" (n), "1" ((long) to), "2" ((long) from)
		  : "memory", "cc");

    return(to);
}

/* Alternative for 586: Unroll loop, copy 32 bytes at a time */
/* Alternative i386 copy: unrolled loop moving 32 bytes per iteration
 * through EAX/EDX.
 *
 * Flow of the asm (ECX = n, EDI = to, ESI = from):
 *  - n <= 32 (unsigned compare): jump to label 3 and copy everything with
 *    rep movsb.
 *  - otherwise copy (-to & 3) bytes with rep movsb so the destination is
 *    4-byte aligned, then run the 32-byte loop at label 1 while at least
 *    32 bytes remain, and finally copy what is left with rep movsb.
 *  - the loop is controlled by the sign flag of "subl $32, %ecx"; the
 *    mov/lea instructions in between do not change the flags.
 *  - "read-ahead": before and inside the loop a dword is loaded from the
 *    destination; the loaded values are overwritten immediately and never
 *    used.
 */
static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
		  " movl %%edi, %%eax\n"
		  " cmpl $32, %%ecx\n"
		  " cld\n"
		  " jbe 3f\n"
		  " negl %%eax\n"		/* Align dest */
		  " andl $3, %%eax\n"
		  " subl %%eax, %%ecx\n"
		  " xchgl %%eax, %%ecx\n"
		  " rep ; movsb\n"
		  " movl %%eax, %%ecx\n"
		  " subl $32, %%ecx\n"
		  " js 2f\n"
		  " movl (%%edi), %%eax\n"
		"1: movl 28(%%edi), %%edx\n"   	/* Trick: Read-ahead */
		  " subl $32, %%ecx\n"
		  " movl (%%esi), %%eax\n"
		  " movl 4(%%esi), %%edx\n"
		  " movl %%eax, (%%edi)\n"
		  " movl %%edx, 4(%%edi)\n"
		  " movl 8(%%esi), %%eax\n"
		  " movl 12(%%esi), %%edx\n"
		  " movl %%eax, 8(%%edi)\n"
		  " movl %%edx, 12(%%edi)\n"
		  " movl 16(%%esi), %%eax\n"
		  " movl 20(%%esi), %%edx\n"
		  " movl %%eax, 16(%%edi)\n"
		  " movl %%edx, 20(%%edi)\n"
		  " movl 24(%%esi), %%eax\n"
		  " movl 28(%%esi), %%edx\n"
		  " movl %%eax, 24(%%edi)\n"
		  " movl %%edx, 28(%%edi)\n"
		  " leal 32(%%esi), %%esi\n"
		  " leal 32(%%edi), %%edi\n"
		  " jns 1b\n"
		"2: addl $32, %%ecx\n"
		"3: rep ; movsb"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  : "0" (n), "1" ((long) to), "2" ((long) from)
		  : "eax", "edx", "memory", "cc");

}

/* Get current time (for benchmarking), i386.
 *
 * Executes cpuid with EAX = 0 followed by rdtsc and returns EAX, i.e. the
 * low 32 bits of the time-stamp counter (cpuid is presumably there as a
 * serializing barrier). EBX is saved and restored by hand around cpuid
 * instead of being listed as a clobber.
 */
static unsigned int taketime(void)	/* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
		" pushl %%ebx\n"
		" cpuid\n"
		" rdtsc\n"
		" popl %%ebx\n"
		: "=a" (eax)
		: "0" (0)
		: "ecx", "edx", "cc");

    return(eax);
}

#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /***************** AMD64 */

#define SiS_checkosforsse	/* Does this cpu support sse and do we need to check os? */
#define SiS_canBenchmark	/* Can we perform a benchmark? */
#ifdef SIS_LINUX
#define SiS_haveProc		/* Do we have /proc/cpuinfo or similar? */
#endif
#define SiS_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */

/* Built-in memcpy for AMD64 */
/* String-instruction memcpy for AMD64.
 *
 * Copies n >> 3 quadwords with rep movsq, then reloads RCX with n & 7 and
 * copies the remaining bytes with rep movsb. d1..d3 only exist to tell the
 * compiler that RCX, RDI and RSI are modified.
 *
 * All three register outputs are early-clobber ("=&"), so the register
 * holding the byte remainder (operand 4) can never be allocated to one of
 * the registers the string instructions overwrite.
 *
 * Returns "to" (the original destination pointer).
 */
static __inline void * builtin_memcpy(void * to, const void * from, int n)
{
    long d1, d2, d3;

    __asm__ __volatile__ (
		" cld\n"
		" rep ; movsq\n"
		" movq %4, %%rcx\n"
		" rep ; movsb"
		: "=&c" (d1), "=&D" (d2), "=&S" (d3)
		: "0" ((unsigned long)(n >> 3)), "q" ((unsigned long)(n & 7)),
		  "1" ((long) to), "2" ((long) from)
		: "memory");

    return(to);
}

/* Alternative: Unroll loop, copy 32 bytes at a time */
/* Alternative AMD64 copy: unrolled loop moving 32 bytes per iteration
 * through RAX, RDX, R8 and R9.
 *
 * Flow of the asm (RCX = n, RDI = to, RSI = from):
 *  - n <= 32 (unsigned compare): jump to label 1 and copy everything with
 *    rep movsb.
 *  - otherwise copy (-to & 7) bytes with rep movsb so the destination is
 *    8-byte aligned, then run the 32-byte loop at label 3 while at least
 *    32 bytes remain, and finally copy what is left with rep movsb.
 *  - the loop is controlled by the sign flag of "subq $32, %rcx"; the
 *    mov/lea instructions in between do not change the flags.
 */
static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    long d1,d2,d3;

    __asm__ __volatile__(
		  " movq %%rdi, %%rax\n"
		  " cmpq $32, %%rcx\n"
		  " cld\n"			/* Pipeline; no other flags but DF */
		  " jbe 1f\n"
		  " negq %%rax\n"		/* Align dest */
		  " andq $7, %%rax\n"
		  " subq %%rax, %%rcx\n"
		  " xchgq %%rax, %%rcx\n"
		  " rep ; movsb\n"
		  " movq %%rax, %%rcx\n"
		  " subq $32, %%rcx\n"
		  " js 2f\n"
		  ".p2align 4\n"
		"3: subq $32, %%rcx\n"
		  " movq (%%rsi), %%rax\n"
		  " movq 8(%%rsi), %%rdx\n"
		  " movq 16(%%rsi), %%r8\n"
		  " movq 24(%%rsi), %%r9\n"
		  " movq %%rax, (%%rdi)\n"
		  " movq %%rdx, 8(%%rdi)\n"
		  " movq %%r8, 16(%%rdi)\n"
		  " movq %%r9, 24(%%rdi)\n"
		  " leaq 32(%%rsi), %%rsi\n"
		  " leaq 32(%%rdi), %%rdi\n"
		  " jns 3b\n"
		"2: addq $32, %%rcx\n"
		"1: rep ; movsb"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  :"0" ((ULong) n), "1" ((long) to), "2" ((long) from)
		  : "rax", "rdx", "r8", "r9", "memory", "cc");

}

/* Get current time (for benchmarking), AMD64.
 *
 * Executes cpuid with EAX = 0 followed by rdtsc and returns EAX, i.e. the
 * low 32 bits of the time-stamp counter (cpuid is presumably there as a
 * serializing barrier).
 *
 * RBX is overwritten by cpuid and is simply declared as a clobber. It must
 * not be saved with push/pop inside the asm: on x86-64 the compiler may
 * keep live data in the red zone below the stack pointer, and a push
 * would overwrite it.
 */
static unsigned int taketime(void)	/* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
		" cpuid\n"
		" rdtsc\n"
		: "=a" (eax)
		: "0" (0)
		: "rbx", "rcx", "rdx", "cc");

    return(eax);
}

#else		/* **************************************** Other archs */

/* 1. Can we do a benchmark?		*/
/* #define SiS_canBenchmark		*/

/* 2. Do we have /proc filesystem or similar for CPU information? */
/* #define SiS_haveProc			*/

/* 3. Optional: built-in memcpy()	*/
/* #define SiS_haveBuiltInMC		*/
/* static __inline void * builtin_memcpy(void * to, const void * from, int n)
   {
   }
*/

/* 4. Function for getting current time (for benchmarking)  */
/* static unsigned int taketime(void)
   {
   }
*/

#endif

/************************************************************************/
/*                   Generic built-in memcpy wrapper                    */
/************************************************************************/

#ifdef SiS_haveBuiltInMC
/* Adapter giving the architecture's builtin_memcpy() the vidCopyFunc
 * calling convention; its return value is discarded.
 */
static void SiS_builtin_memcpy(UChar *dst, const UChar *src, int size)
{
    builtin_memcpy(dst, src, size);
}
#endif

/************************************************************************/
/* Generic routines if Benchmark can be performed (all archs, all OSes) */
/************************************************************************/

#ifdef SiS_canBenchmark

/* Time one call of a copy routine.
 *
 * Runs mf(buf1, buf2, size) between two taketime() samples and returns the
 * difference of the samples. If the second sample is not greater than the
 * first, the counter is assumed to have wrapped once around 32 bits.
 */
static unsigned int time_function(vidCopyFunc mf, UChar *buf1, UChar *buf2, int size)
{
    unsigned int start, stop;

    start = taketime();
    mf(buf1, buf2, size);
    stop = taketime();

    if(start < stop) {
       return stop - start;
    }

    /* wrapped (or identical) samples */
    return 0xFFFFFFFFU - (start - stop - 1);
}

/* Set up the three benchmark buffers:
 *   buf1 - offscreen framebuffer memory, rounded up to a 32-byte boundary
 *   buf2 - malloc'ed block standing in for a video player buffer
 *   buf3 - malloc'ed block of uninitialized "video" data
 *
 * buf2 and buf3 are returned exactly as malloc delivered them (15 spare
 * bytes are included so the caller can align them); the caller frees them.
 *
 * Returns the framebuffer allocation handle, or NULL if any allocation
 * failed, in which case everything obtained so far has been released.
 */
static void *
SiS_AllocBuffers(ScrnInfoPtr pScrn, UChar **buf1, UChar **buf2, UChar **buf3)
{
    SISPtr pSiS = SISPTR(pScrn);
    void *handle = NULL;
    unsigned int offset = SISAllocateFBMemory(pScrn, &handle, BUFFERSIZE + 31);
    ULong fbAddr;

    if(offset == 0) {
       return NULL;
    }

    /* 31 extra bytes were requested to leave room for this round-up */
    fbAddr = (ULong)((UChar *)pSiS->FbBase + offset);
    *buf1 = (UChar *)((fbAddr + 31) & ~31);

    *buf2 = (UChar *)malloc(BUFFERSIZE + 15);
    if(*buf2 == NULL) {
       goto release_fb;
    }

    *buf3 = (UChar *)malloc(BUFFERSIZE + 15);
    if(*buf3 == NULL) {
       goto release_buf2;
    }

    return handle;

release_buf2:
    free(*buf2);
release_fb:
    SISFreeFBMemory(pScrn, &handle);
    return NULL;
}

/* Time every usable routine in MCFunctions and report the fastest.
 *
 * A table entry is usable when its mycpuflag bit is set in myCPUflags.
 * Each usable routine is timed four times over BUFFERSIZE bytes and the
 * smallest tick count is kept. Before every timed run the system-side
 * buffer is refreshed with the built-in copy (buf3 -> buf2 when copying
 * to video RAM, buf2 -> buf3 when copying from it).
 *
 *   buf1          - framebuffer-side buffer
 *   buf2, buf3    - system memory buffers
 *   frqBuf/cpuFreq- if frqBuf is non-NULL, results are printed in MiB/s
 *                   using cpuFreq (MHz); otherwise raw tick counts
 *   UMemCpy       - not used by this function
 *   best2         - receives the index of the fastest routine that does
 *                   not require alignment (0 if none was timed)
 *   from          - TRUE: time buf1 -> buf2, FALSE: buf2 -> buf1
 *
 * Returns the table index of the overall fastest routine (0 if none was
 * timed).
 */
static int SiS_BenchmarkMemcpy(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
                               unsigned int myCPUflags, UChar *buf1, UChar *buf2,
			       UChar *buf3, char *frqBuf, double cpuFreq,
			       vidCopyFunc *UMemCpy, int *best2, Bool from)
{
    unsigned int fastest = 0xFFFFFFFFU, fastestUnaligned = 0xFFFFFFFFU;
    int idx, run, winner = 0;

    (*best2) = 0;

    /* Touch buf1 and buf2 so that they are likely to be paged in */
    SiS_libc_memcpy(buf1, buf2, BUFFERSIZE);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
	       "Benchmarking %s RAM to %s RAM memory transfer methods:\n",
	       from ? "video" : "system",
	       from ? "system" : "video");

#ifdef TWDEBUG
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Benchmark: CPUFlags %x\n", myCPUflags);
#endif

    for(idx = 0; MCFunctions[idx].mFunc; idx++) {
	SISMCFuncData *entry = &MCFunctions[idx];
	unsigned int ticks = 0, sample;

	if(!(myCPUflags & entry->mycpuflag)) {
	   continue;
	}

	/* Four runs per routine; keep the quickest one */
	for(run = 0; run < 4; run++) {
	   if(from) {
	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      sample = time_function(entry->mFunc, buf2, buf1, BUFFERSIZE);
	   } else {
	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      sample = time_function(entry->mFunc, buf1, buf2, BUFFERSIZE);
	   }
	   if((run == 0) || (sample < ticks)) {
	      ticks = sample;
	   }
	}

	if(frqBuf && (ticks != 0)) {
	   xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
			"\tChecked %s memcpy()... \t%.1f MiB/s\n",
			entry->mName,
			cpuFreq * 1.e6 * (double)BUFFERSIZE / ((double)(ticks) * (double)(0x100000)));
	} else {
	   xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
			"\tChecked %s memcpy()... \t%u\n",entry->mName, ticks);
	}

	if(ticks < fastest) {
	   fastest = ticks;
	   winner = idx;
	}

	if(!entry->reqAlignment && (ticks < fastestUnaligned)) {
	   fastestUnaligned = ticks;
	   (*best2) = idx;
	}
    }

    return winner;
}

/* Choose copy routines from the static grades in the table, without
 * benchmarking.
 *
 * Only entries whose mycpuflag bit is set in myCPUflags take part. The
 * grade used is gradefrom when "from" is set and grade otherwise; the
 * lowest grade below 10 wins, the first entry winning ties.
 *
 * *UMemCpy receives the best routine that does not require alignment, or
 * the libc wrapper if there is none. The return value is the overall best
 * routine, or the libc wrapper if no entry qualified.
 */
static vidCopyFunc SiS_GetBestByGrade(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
			unsigned int myCPUflags, vidCopyFunc *UMemCpy, Bool from)
{
    int idx;
    int pick = -1, pickUnaligned = -1;
    int lowest = 10, lowestUnaligned = 10;

    *UMemCpy = SiS_libc_memcpy;

    for(idx = 0; MCFunctions[idx].mFunc; idx++) {
	int grade;

	if(!(myCPUflags & MCFunctions[idx].mycpuflag)) {
	   continue;
	}

	grade = from ? MCFunctions[idx].gradefrom : MCFunctions[idx].grade;

	if(grade < lowest) {
	   pick = idx;
	   lowest = grade;
	}

	if((grade < lowestUnaligned) && !MCFunctions[idx].reqAlignment) {
	   pickUnaligned = idx;
	   lowestUnaligned = grade;
	}
    }

    if(pick < 0) {
       return SiS_libc_memcpy;
    }

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
		"Chose %s method for aligned data transfers %s video RAM\n",
		MCFunctions[pick].mName,
		from ? "from" : "to");

    if(pickUnaligned >= 0) {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
		"Chose %s method for unaligned data transfers %s video RAM\n",
		MCFunctions[pickUnaligned].mName,
		from ? "from" : "to");
       *UMemCpy = MCFunctions[pickUnaligned].mFunc;
    }

    return MCFunctions[pick].mFunc;
}
#endif /* canBenchmark */

/**********************************************************************/
/*      Generic routines if /proc filesystem is available (Linux)     */
/**********************************************************************/

#ifdef SiS_haveProc
/* Linux: Read the beginning of a file (/proc/cpuinfo) into a buffer.
 *
 *   buf      - destination; must provide at least CPUBUFFERSIZE bytes
 *   filename - file to read
 *
 * At most CPUBUFFERSIZE - 3 bytes are read and the result is always
 * NUL-terminated. A file that is longer than that is truncated instead of
 * being rejected; the only consumer in this file searches for the first
 * "cpu MHz" entry. The upper bound on the number of stored bytes is the
 * same as it has always been, so the slack behind the terminator is
 * unchanged.
 *
 * Returns the number of bytes stored (without the terminator), or 0 if
 * the file could not be opened, could not be read, or was empty.
 */
static int SiS_ReadProc(char *buf, char *filename)
{
    FILE *cpuInfoFile;
    size_t count;

    if((cpuInfoFile = fopen(filename, "r")) == NULL) {
       return 0;
    }

    count = fread(buf, 1, CPUBUFFERSIZE - 3, cpuInfoFile);
    if(ferror(cpuInfoFile)) {
       fclose(cpuInfoFile);
       return 0;
    }

    fclose(cpuInfoFile);

    buf[count] = 0;

    return (int)count;
}

/* Linux: Extract CPU speed from /proc/cpuinfo text.
 *
 *   buf     - NUL-terminated contents of /proc/cpuinfo
 *   cpuFreq - receives the parsed frequency in MHz; set to 0.0 first and
 *             left at 0.0 if the key is not found
 *
 * Looks for the first "cpu MHz\t\t:" entry, skips the single separator
 * character behind the colon and parses the number with strtod(). Values
 * below 10 MHz are rejected as implausible.
 *
 * Returns a pointer to the start of the number inside buf on success and
 * NULL otherwise. Callers in this file only test it for NULL.
 */
static char *SiS_GetCPUFreq(ScrnInfoPtr pScrn, char *buf, double *cpuFreq)
{
    static const char key[] = "cpu MHz\t\t:";
    char *frqBuf, *endBuf;

    (*cpuFreq) = 0.0;

    if((frqBuf = strstr(buf, key))) {
       frqBuf += sizeof(key) - 1;
       /* Skip the separator, but never step over the terminator when the
        * text ends directly behind the key.
        */
       if(*frqBuf) frqBuf++;
       (*cpuFreq) = strtod(frqBuf, &endBuf);
       if(endBuf == frqBuf) frqBuf = NULL;
       if((*cpuFreq) < 10.0) frqBuf = NULL; /* sanity check */
       if(frqBuf) {
          xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU frequency %.2fMhz\n", (*cpuFreq));
       }
    }

    return frqBuf;
}
#endif /* haveProc */

/**********************************************************************/
/*                      Arch-specific routines                        */
/**********************************************************************/

#ifdef SiS_checkosforsse   /* Common i386, AMD64  */

#ifdef SISCHECKOSSSE

#ifndef XFree86LOADER
#include <setjmp.h>
#endif

/* Jump target armed by CheckOSforSSE() with setjmp() before it executes a
 * test SSE instruction.
 */
static jmp_buf sigill_return;

/* Handler installed through xf86InterceptSigIll(); returns control to the
 * setjmp() point in CheckOSforSSE() with value 1.
 */
static void sigill_handler(void)
{
    longjmp(sigill_return, 1);
}
#endif

/* Decide whether SSE instructions may be used.
 *
 * With SISCHECKOSSSE defined: installs sigill_handler via
 * xf86InterceptSigIll(), executes one "xorps %xmm0, %xmm0" and removes the
 * handler again. If the handler fired (setjmp returned non-zero) the
 * result is FALSE and a message is logged, otherwise TRUE.
 *
 * Without SISCHECKOSSSE: no probing is possible; the result simply mirrors
 * pSiS->XvSSEMemcpy (the messages refer to it as the "UseSSE" option) and
 * a hint is logged in both cases.
 */
static Bool CheckOSforSSE(ScrnInfoPtr pScrn)
{
#ifdef SISCHECKOSSSE  /* Check OS for SSE possible: */
    int signo = -1;	/* stays -1 unless the handler fired */

#ifdef SISDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Checking OS SSE support\n");
#endif

    xf86InterceptSigIll(&sigill_handler);

    if(setjmp(sigill_return)) {
       /* reached through longjmp() from sigill_handler */
       signo = 4;
    } else {
       /* Basic asm (no operand list): a single '%' is correct here, and
        * xmm0 is modified without the compiler being told about it.
        */
       __asm__ __volatile__ (" xorps %xmm0, %xmm0\n");
       /* __asm__ __volatile__ (" .byte 0xff\n"); */  /* For test */
    }

    xf86InterceptSigIll(NULL);

#ifdef SISDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "OS SSE support signal %d\n", signo);
#endif

    if(signo != -1) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
		"OS does not support SSE instructions\n");
    }

    return (signo >= 0) ? FALSE : TRUE;

#else  /* no check for SSE possible: */

    SISPtr pSiS = SISPTR(pScrn);

    xf86DrvMsg(pScrn->scrnIndex, pSiS->XvSSEMemcpy ? X_WARNING : X_INFO,
	"Checking OS for SSE support is not supported in this version of " SISMYSERVERNAME "\n");

    if(pSiS->XvSSEMemcpy) {
       xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
	"If you get a signal 4 here, set the option \"UseSSE\" to \"off\".\n");
       return TRUE;
    } else {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
	"If your OS supports SSE, set the option \"UseSSE\" to \"on\".\n");
       return FALSE;
    }
#endif
}

#endif /* SiS_checkosforsse */

#ifdef __i386__   /* i386 specific *************************************/

/* Instantiate the i386 copy routines:
 *   SiS_sse_memcpy    - SSE loop, prefetchnta, no begin step, sfence at end
 *   SiS_mmxext_memcpy - MMX2 loop, prefetchnta, emms first, sfence+emms at end
 *   SiS_now_memcpy    - MMX loop, 3DNow! prefetch, femms before and after
 *   SiS_mmx_memcpy    - MMX loop, no prefetch, emms before and after
 */
PREFETCH_FUNC(SiS_sse,SSE,SSE,,FENCE,small_memcpy_i386)
PREFETCH_FUNC(SiS_mmxext,MMXEXT,SSE,EMMS,FENCEMMS,small_memcpy_i386)
PREFETCH_FUNC(SiS_now,MMX,NOW,FEMMS,FEMMS,small_memcpy_i386)
NOPREFETCH_FUNC(SiS_mmx,MMX,EMMS,EMMS,small_memcpy_i386)

/* Candidate copy routines for i386.
 * Columns: routine, log name, required CPU flag, grade (to video RAM),
 * grade (from video RAM), requires alignment. Lower grades are preferred
 * when no benchmark can be run; the NULL entry terminates the table.
 */
static SISMCFuncData MCFunctions_i386[] = {
    {SiS_libc_memcpy,   "libc",      SIS_CPUFL_LIBC,  4,  4, FALSE},
    {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI,    5,  5, FALSE},
    {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2,   6,  6, FALSE},
    {SiS_mmx_memcpy,    "MMX",       SIS_CPUFL_MMX,   3,  3, FALSE},
    {SiS_sse_memcpy,    "SSE",       SIS_CPUFL_SSE,   1,  0, TRUE},
    {SiS_now_memcpy,    "3DNow!",    SIS_CPUFL_3DNOW, 2,  2, FALSE},
    {SiS_mmxext_memcpy, "MMX2",      SIS_CPUFL_MMX2,  0,  1, FALSE},
    {NULL,              "",          0,              10, 10, FALSE}
};

/* Flags OR-ed into the detected CPU flags, so these routines always take part */
#define Def_FL  (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2)  /* Default methods */

/* Execute cpuid for leaf "op" and store the resulting EAX, EBX, ECX and
 * EDX in the given lvalues (i386 only).
 *
 * EBX is saved and restored by hand around the instruction; its cpuid
 * value is copied to ESI in between and handed out from there. The
 * operand is pinned to ESI ("=S") rather than left to the register
 * allocator: with a free "=r" choice the compiler could pick EBX itself,
 * and the final popl would then overwrite the result.
 */
#define cpuid(op, eax, ebx, ecx, edx) 		\
    __asm__ __volatile__ (			\
		" pushl %%ebx\n"		\
		" cpuid\n"			\
		" movl %%ebx, %1\n"		\
		" popl %%ebx\n"			\
		: "=a" (eax), "=S" (ebx), 	\
		  "=c" (ecx), "=d" (edx)	\
		: "a" (op)			\
		: "cc")

/* Check that the CPU offers what the rest of this file relies on (i386).
 *
 * Returns TRUE only if all three tests pass:
 *  1. bit 21 (0x200000) of EFLAGS can be toggled - the code treats this
 *     as "cpuid is available";
 *  2. cpuid leaf 0 reports a non-zero maximum leaf in EAX (fails silently);
 *  3. cpuid leaf 1 reports bit 4 of EDX - the code treats this as "rdtsc
 *     is available".
 * Failures of test 1 and 3 are logged.
 */
static Bool cpuIDSupported(ScrnInfoPtr pScrn)
{
    int eax, ebx, ecx, edx;

    /* Check for cpuid instruction */
    /* ecx keeps the original EFLAGS, eax the value read back after the
     * attempt to flip bit 21; equal values mean the bit could not change.
     * The modified EFLAGS value is left in place afterwards.
     */
    __asm__ __volatile__ (
		" pushf\n"
		" popl %0\n"
		" movl %0, %1\n"
		" xorl $0x200000, %0\n"
		" push %0\n"
		" popf\n"
		" pushf\n"
		" popl %0\n"
		: "=a" (eax), "=c" (ecx)
		:
		: "cc");

    if(eax == ecx) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support CPUID instruction\n");
       return FALSE;
    }

    /* Check for cpuid level */
    cpuid(0x00000000, eax, ebx, ecx, edx);
    if(!eax) {
       return FALSE;
    }

    /* Check for RDTSC */
    cpuid(0x00000001, eax, ebx, ecx, edx);

    if(!(edx & 0x10)) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support RDTSC instruction\n");
       return FALSE;
    }

    return TRUE;
}

/* Translate cpuid feature bits into SIS_CPUFL_* flags (i386).
 *
 * Returns 0 if cpuIDSupported() fails. Otherwise:
 *   leaf 1, EDX bit 23          -> MMX
 *   leaf 1, EDX bit 25          -> SSE and MMX2
 *   leaf 1, EDX bit 26          -> SSE and SSE2
 *   leaf 0x80000001, EDX bit 31 -> 3DNow!
 *   leaf 0x80000001, EDX bit 22 -> MMX2, only if the vendor string read
 *                                  from leaf 0 is "AuthenticAMD"
 * The extended leaf is consulted only if leaf 0x80000000 reports that it
 * exists.
 */
static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    unsigned int eax, ebx, ecx, edx;
    unsigned int features = 0;
    Bool vendorIsAMD;

    /* cpuid and rdtsc are both required */
    if(!cpuIDSupported(pScrn)) {
       return 0;
    }

    /* Vendor string: "Auth" "enti" "cAMD" in EBX, EDX, ECX */
    cpuid(0x00000000, eax, ebx, ecx, edx);
    vendorIsAMD = (ebx == 0x68747541) && (edx == 0x69746e65) && (ecx == 0x444d4163);

    /* Standard feature bits */
    cpuid(0x00000001, eax, ebx, ecx, edx);
    if(edx & 0x00800000) {
       features |= SIS_CPUFL_MMX;
    }
    if(edx & 0x02000000) {
       features |= (SIS_CPUFL_SSE | SIS_CPUFL_MMX2);
    }
    if(edx & 0x04000000) {
       /* SSE2 is not used directly; it implies SSE */
       features |= (SIS_CPUFL_SSE | SIS_CPUFL_SSE2);
    }

    /* Extended feature bits, if that leaf exists */
    cpuid(0x80000000, eax, ebx, ecx, edx);
    if(eax < 0x80000001) {
       return features;
    }

    cpuid(0x80000001, eax, ebx, ecx, edx);
    if(edx & 0x80000000) {
       features |= SIS_CPUFL_3DNOW;
    }
    if(vendorIsAMD && (edx & 0x00400000)) {
       features |= SIS_CPUFL_MMX2;
    }

    return features;
}

#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /* AMD64 specific ***** */

/* Instantiate SiS_sse_memcpy for AMD64: 64-bit SSE loop, prefetchnta,
 * no begin step, sfence at the end, 64-bit tail copy.
 */
PREFETCH_FUNC(SiS_sse,SSE64,SSE,,FENCE,small_memcpy_amd64)

/* Candidate copy routines for AMD64.
 * Columns: routine, log name, required CPU flag, grade (to video RAM),
 * grade (from video RAM), requires alignment. Lower grades are preferred
 * when no benchmark can be run; the NULL entry terminates the table.
 */
static SISMCFuncData MCFunctions_AMD64[] = {
    {SiS_libc_memcpy,   "libc",      SIS_CPUFL_LIBC, 2,  2, FALSE},
    {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI,   1,  1, FALSE},
    {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2,  3,  3, FALSE},
    {SiS_sse_memcpy,    "SSE",       SIS_CPUFL_SSE,  0,  0, TRUE},
    {NULL,              "",          0,             10, 10, FALSE}
};

/* Flags OR-ed into the detected CPU flags, so these routines always take part */
#define Def_FL  (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2)

/* AMD64: no probing is done; SSE and SSE2 are reported unconditionally. */
static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    return((unsigned int)(SIS_CPUFL_SSE|SIS_CPUFL_SSE2));
}

#else  /* Specific for other archs ******************************** */

/* Fill in here */

/* Other architectures: only the libc routine is enabled by default */
#define Def_FL  (SIS_CPUFL_LIBC)

/* Other architectures: no CPU features are reported. */
static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    return((unsigned int)(0));
}

#endif

/**********************************************************************/
/*     Benchmark the video copy routines and choose the fastest       */
/**********************************************************************/

#ifdef SiS_canBenchmark
/* Select the copy routines for one transfer direction.
 *
 *   pScreen     - screen the selection is made for
 *   MCFunctions - NULL-terminated table of candidate routines
 *   UMemCpy     - receives the routine to use for unaligned transfers
 *   from        - TRUE: video RAM -> system RAM, FALSE: the other way
 *
 * Returns the routine to use for aligned transfers.
 *
 * If benchmarking is disabled (pSiS->BenchMemCpy not set) both results are
 * the libc wrapper. If the benchmark buffers cannot be allocated the
 * choice is made from the static grades in the table. Otherwise all
 * routines enabled by pSiS->CPUFlags | Def_FL are timed, and the fastest
 * one overall and the fastest one without alignment requirement are
 * selected and logged.
 */
static vidCopyFunc
SiSVidCopyInitGen(ScreenPtr pScreen, SISMCFuncData *MCFunctions, vidCopyFunc *UMemCpy, Bool from)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
    SISPtr pSiS = SISPTR(pScrn);
    void *fbhandle = NULL;
    char  *frqBuf = NULL;
    UChar *buf1, *buf2, *buf3;
    double cpuFreq = 0.0;
    unsigned int myCPUflags = pSiS->CPUFlags | Def_FL;
    int best, secondbest;
#ifdef SiS_haveProc
    char buf[CPUBUFFERSIZE];
#endif

    *UMemCpy = SiS_libc_memcpy;

    /* Bail out if user disabled benchmarking */
    if(!pSiS->BenchMemCpy) {
       return SiS_libc_memcpy;
    }

#ifdef SiS_haveProc
    /* Read /proc/cpuinfo into buf */
    if(SiS_ReadProc(buf, "/proc/cpuinfo")) {

       /* Extract CPU frequency */
       frqBuf = SiS_GetCPUFreq(pScrn, buf, &cpuFreq);

    }
#endif

    /* Allocate buffers */
    if(!(fbhandle = SiS_AllocBuffers(pScrn, &buf1, &buf2, &buf3))) {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
       		"Failed to allocate video RAM for video data transfer benchmark\n");
       return SiS_GetBestByGrade(pScrn, MCFunctions, myCPUflags, UMemCpy, from);
    }

    /* Perform Benchmark (the system buffers are aligned to 16 bytes here;
     * the unaligned pointers are kept for free())
     */
    best = SiS_BenchmarkMemcpy(pScrn, MCFunctions, myCPUflags, buf1,
    				(UChar *)(((unsigned long)buf2 + 15) & ~15),
				(UChar *)(((unsigned long)buf3 + 15) & ~15),
				frqBuf, cpuFreq, UMemCpy, &secondbest, from);

    /* Free buffers */
    SISFreeFBMemory(pScrn, &fbhandle);
    free(buf2);
    free(buf3);

    /* Install the routine that is announced for unaligned transfers below;
     * secondbest always indexes an entry without alignment requirement
     * (or entry 0).
     */
    *UMemCpy = MCFunctions[secondbest].mFunc;

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
	       "Using %s method for aligned data transfers %s video RAM\n",
	       MCFunctions[best].mName,
	       from ? "from" : "to");

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
	       "Using %s method for unaligned data transfers %s video RAM\n",
	       MCFunctions[secondbest].mName,
	       from ? "from" : "to");

    return MCFunctions[best].mFunc;
}
#endif /* canBenchmark */

/**********************************************************************/
/* 		       main(): Get CPU capabilities		      */
/* 			    (called externally)			      */
/**********************************************************************/

/* Return the SIS_CPUFL_* flags usable on this machine.
 *
 * Starts from SiS_GetCpuFeatures(). On architectures that define
 * SiS_checkosforsse, the SSE and SSE2 flags are removed again if the CPU
 * reports either of them but CheckOSforSSE() says they must not be used.
 * CheckOSforSSE() is only called when at least one of the two is set.
 */
unsigned int
SiSGetCPUFlags(ScrnInfoPtr pScrn)
{
    unsigned int flags = SiS_GetCpuFeatures(pScrn);

#ifdef SiS_checkosforsse
    const unsigned int sseMask = SIS_CPUFL_SSE | SIS_CPUFL_SSE2;

    if((flags & sseMask) && !CheckOSforSSE(pScrn)) {
       flags &= ~sseMask;
    }
#endif

    return flags;
}

/**********************************************************************/
/*                       main(): SiSVidCopyInit()                     */
/*			    (called externally)			      */
/*		(SiSGetCPUFlags must be called before this one)       */
/**********************************************************************/

/* Public entry point: select the copy routines for one direction.
 *
 * On i386 and AMD64 builds with benchmarking support this forwards to
 * SiSVidCopyInitGen() with the matching routine table. On every other
 * build both the return value and *UMemCpy are the libc wrapper.
 *
 *   UMemCpy - receives the routine for unaligned transfers
 *   from    - TRUE: video RAM -> system RAM, FALSE: the other way
 *
 * Returns the routine for aligned transfers.
 */
vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
#if defined(__i386__) && defined(SiS_canBenchmark)
    return(SiSVidCopyInitGen(pScreen, MCFunctions_i386, UMemCpy, from));
#elif (defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__)) && defined(SiS_canBenchmark)
    return(SiSVidCopyInitGen(pScreen, MCFunctions_AMD64, UMemCpy, from));
#else /* Other cases: Use libc memcpy() */
    *UMemCpy = SiS_libc_memcpy;
    return SiS_libc_memcpy;
#endif
}

/* Return the default copy routine, which is the libc wrapper. */
vidCopyFunc SiSVidCopyGetDefault(void)
{
    return SiS_libc_memcpy;
}

#endif /* GNU C */


