/* $Id: mach64dma.c,v 1.50 2001/04/09 05:20:29 svartalf Exp $ */

/*
 * GLX Hardware Device Driver for ATI Rage Pro
 * Copyright (C) 1999 Gareth Hughes
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Based on MGA driver: mgadma.c ???
 *
 *    Gareth Hughes <gareth@precisioninsight.com>
 */

#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>

#include "glx_symbols.h"
#include "mach64glx.h"
#include "pb.h"
#include "hw_mtrr.h"

/* forward declarations: both routines are defined further down in this file */
static void mach64FlushPseudoDma( void );
int mach64WaitForDmaCompletion( void );

/* set to 1 by mach64LockRegisters and back to 0 by mach64UnlockRegisters */
static	hwUI32	mach64RegistersLocked;		/* true when the registers are memory protected */


// PPC needs values byte swapped on their way into the dma buffer.
// Reverses the order of the four low bytes of a:
// byte 0 <-> byte 3 and byte 1 <-> byte 2.
unsigned _SWAP( unsigned a ) 
{
	unsigned	lowest  = a & 255;
	unsigned	lower   = ( a >> 8 ) & 255;
	unsigned	higher  = ( a >> 16 ) & 255;
	unsigned	highest = a >> 24;

	return ( lowest << 24 ) | ( lower << 16 ) | ( higher << 8 ) | highest;
}


/*
 * Register aperture locking: during DMA operations the first 0x1000 bytes
 * at mach64glx.MMIOBase are write protected so nobody else can write to
 * the registers.
 * (Pilfered from MGA's DMA code by Frank Earl (08-17-00))
 *
 * mach64UnlockRegisters
 * Give read/write access to the register page back and clear the
 * mach64RegistersLocked flag.
 */
static void mach64UnlockRegisters( void )
{
	const int	fullAccess = PROT_READ | PROT_WRITE;

	/* make the register page writable again ... */
	mprotect( mach64glx.MMIOBase, 0x1000, fullAccess );

	/* ... and remember that it is no longer protected */
	mach64RegistersLocked = 0;
}


/*
 * mach64RegisterAccessSignalHandler
 * Fault handler installed by mach64LockRegisters.  A fault while the
 * register page is write protected means somebody touched a hardware
 * register during dma, so wait for the dma to finish;
 * mach64WaitForDmaCompletion unlocks the registers again when they were
 * locked.
 *
 * On i386 Linux the faulting address (sc.cr2) is checked against the
 * protected page [MMIOBase, MMIOBase + 0x1000); anything outside it is a
 * genuine segmentation fault and is fatal.  Other platforms cannot make
 * that check here.
 *
 * NOTE(review): hwMsg/FatalError are called from signal context; whether
 * they are safe to call there cannot be told from this file.
 */
#if defined(__i386__) && defined(__linux__)
static void mach64RegisterAccessSignalHandler( int signal, struct sigcontext sc) 
#else
static void mach64RegisterAccessSignalHandler( int signal ) 
#endif  
{

// FIXME -- Is there any way for someone to do this under Alpha/PPC?
#if defined(__i386__) && defined(__linux__)
	/* use char pointers: arithmetic on void * is only a compiler extension */
	char	*faultAddr = (char *)sc.cr2;
	char	*mmioStart = (char *)mach64glx.MMIOBase;

	/* the protected page ends at mmioStart + 0x1000 exclusive, so an
	 * address equal to that bound is already outside of it */
	if ( ( faultAddr < mmioStart ) || ( faultAddr >= mmioStart + 0x1000 ) )
	{
		/* Oops, a real segmentation fault. This might be a good place
		 * to set a breakpoint */
		FatalError("Segmentation fault!\n");
	}
#endif

	/* a fault inside the register page is only expected while it is locked */
	if ( !mach64RegistersLocked ) 
	{
		hwMsg( 10, "RegisterAccessSignalHandler() without registersLocked\n" );
		FatalError("RegisterAccessSignalHandler() without registersLocked\n");
	}
		
	/* someone has tried to access hardware registers, so make
	   sure dma is completed */
	hwMsg( 10, "RegisterAccessSignalHandler()\n" );
	mach64WaitForDmaCompletion();
	hwMsg( 10, "Leaving RASH()\n" );
}

/*
 * mach64LockRegisters
 * Write protect the register page, install
 * mach64RegisterAccessSignalHandler as the fault handler and set the
 * mach64RegistersLocked flag.
 */
static void mach64LockRegisters( void )
{
	void	(*onFault)( int ) = (void (*)(int))mach64RegisterAccessSignalHandler;

	/* from here on a write to a hardware register raises a fault */
	mprotect( mach64glx.MMIOBase, 0x1000, PROT_READ );

	/* the handler goes on SIGBUS for FreeBSD and on SIGSEGV everywhere else */
#ifdef __FreeBSD__
	signal( SIGBUS, onFault );
#else
	signal( SIGSEGV, onFault );
#endif

	mach64RegistersLocked = 1;
}


	
/*
 * mach64EngineReset
 * If the FIFO has been locked due to improper FIFO discipline, the FIFO and
 * draw engine must be reset before continuing.  Hopefully will allow X to
 * exit gracefully or be killed.
 *
 * Every register update below is a read-modify-write: the current value is
 * read, byte swapped to host order, changed, and swapped back for the write.
 */
void mach64EngineReset( void )
{
	hwMsg( 1, "mach64EngineReset: Ensuring Bus Mastering is turned off\n");
	
	// Kill off bus mastering with extreme prejudice...
	OUTREG( MACH64_BUS_CNTL, SWAP( SWAP( INREG( MACH64_BUS_CNTL) ) | BUS_MASTER_DIS ) );

	hwMsg( 1, "mach64EngineReset: clearing FIFO errors\n" );

	// Reset engine -- This is accomplished by setting the GUI_ENGINE_ENABLE bit
	// of the GEN_TEST_CNTL register high, then low (per the documentation, it's
	// on the high to low transition that the GUI engine gets reset...)
	//
	// NOTE(review): this sequence ends with GUI_ENGINE_ENABLE cleared; confirm
	// against the register reference whether the engine should be re-enabled
	// afterwards.
	OUTREG( MACH64_GEN_TEST_CNTL, SWAP( SWAP( INREG( MACH64_GEN_TEST_CNTL ) ) | GUI_ENGINE_ENABLE ) );
	OUTREG( MACH64_GEN_TEST_CNTL, SWAP( SWAP( INREG( MACH64_GEN_TEST_CNTL ) ) & ~GUI_ENGINE_ENABLE ) );
}


/*
 * mach64DumpEngineState
 * Diagnostic aid for when things go wrong: for a fixed list of registers,
 * print the value currently read back from the chip next to the value
 * held in mach64glx.registers[] for the same register.
 */
void mach64DumpEngineState( void )
{
	int		reg;

/* read one register and print it beside its mach64glx.registers[] entry;
 * label is the right-aligned register name exactly as it appears in the log */
#define DUMP_REG( label, offset )						\
	do {									\
		reg = SWAP( INREG( offset ) );					\
		hwError( label " current = 0x%08x default = 0x%08x\n", reg,	\
			 mach64glx.registers[offset>>2] );			\
	} while ( 0 )

	hwError( "\n" );
	hwError( "mach64DumpEngineState:\n" );
	hwError( "\n" );

	/* bus status registers */
	DUMP_REG( "           BUS_CNTL", MACH64_BUS_CNTL );
	hwError( "\n" );

	/* draw engine registers */
	DUMP_REG( "           DST_CNTL", MACH64_DST_CNTL );
	DUMP_REG( "      DST_OFF_PITCH", MACH64_DST_OFF_PITCH );
	DUMP_REG( "        Z_OFF_PITCH", MACH64_Z_OFF_PITCH );
	DUMP_REG( "             Z_CNTL", MACH64_Z_CNTL );
	DUMP_REG( "     ALPHA_TST_CNTL", MACH64_ALPHA_TST_CNTL );
	DUMP_REG( "           SRC_CNTL", MACH64_SRC_CNTL );
	DUMP_REG( "      SRC_OFF_PITCH", MACH64_SRC_OFF_PITCH );
	DUMP_REG( "       DP_PIX_WIDTH", MACH64_DP_PIX_WIDTH );
	DUMP_REG( "             DP_SRC", MACH64_DP_SRC );
	hwError( "\n" );

	/* engine status registers */
	DUMP_REG( "          FIFO_STAT", MACH64_FIFO_STAT );
	DUMP_REG( "  GUI_CMDFIFO_DEBUG", MACH64_GUI_CMDFIFO_DEBUG );
	DUMP_REG( "   GUI_CMDFIFO_DATA", MACH64_GUI_CMDFIFO_DATA );
	DUMP_REG( "           GUI_CNTL", MACH64_GUI_CNTL );
	DUMP_REG( "      GUI_TRAJ_CNTL", MACH64_GUI_TRAJ_CNTL );
	DUMP_REG( "           GUI_STAT", MACH64_GUI_STAT );
	hwError( "\n" );

	/* setup engine registers */
	DUMP_REG( "      SCALE_3D_CNTL", MACH64_SCALE_3D_CNTL );
	DUMP_REG( "     TEX_SIZE_PITCH", MACH64_TEX_SIZE_PITCH );
	DUMP_REG( "           TEX_CNTL", MACH64_TEX_CNTL );
	DUMP_REG( "         SETUP_CNTL", MACH64_SETUP_CNTL );
	hwError( "\n" );

	/* bus mastering registers */
	DUMP_REG( "BM_FRAME_BUF_OFFSET", MACH64_BM_FRAME_BUF_OFFSET );
	DUMP_REG( " BM_SYSTEM_MEM_ADDR", MACH64_BM_SYSTEM_MEM_ADDR );
	DUMP_REG( "         BM_COMMAND", MACH64_BM_COMMAND );
	DUMP_REG( "          BM_STATUS", MACH64_BM_STATUS );
	DUMP_REG( "    BM_SYSTEM_TABLE", MACH64_BM_SYSTEM_TABLE );
	DUMP_REG( "        BM_HOSTDATA", MACH64_BM_HOSTDATA );
	DUMP_REG( "    BM_ADDR/BM_DATA", MACH64_BM_ADDR );
	DUMP_REG( "       BM_GUI_TABLE", MACH64_BM_GUI_TABLE );
	DUMP_REG( "   BM_GUI_TABLE_CMD", MACH64_BM_GUI_TABLE_CMD );
	hwError( "\n" );

	/* agp registers */
	DUMP_REG( "           AGP_BASE", MACH64_AGP_BASE );
	DUMP_REG( "           AGP_CNTL", MACH64_AGP_CNTL );
	hwError( "\n" );

#undef DUMP_REG
}



/*
 * delay
 * Body for the polling spin loops.  An empty static function gets inlined
 * and the loop around it deleted by an optimizing compiler; the volatile
 * store below is a side effect the compiler has to keep, so the loop
 * really runs.
 */
static void delay( void ) 
{
	static volatile int	spinSink;

	spinSink = 0;
}

/*
 * mach64WaitForDmaCompletion
 * Poll GUI_STAT until ENGINE_BUSY clears, then unlock the registers if
 * they were locked.  Has a timeout so we can hopefully dump out of X
 * instead of just hanging.
 *
 * Returns curTime - startTime, the usec() span between the first and the
 * most recent poll that found the engine busy.  That is 0 when no dma is
 * active, when dma is being skipped, or when the engine was found busy at
 * most once.
 */
#define	TIMEOUT_USEC		1000000

int mach64WaitForDmaCompletion( void )
{
	int	startTime;
	int	curTime;
	int	guiStat;
	int	busCntl;
	int	i;

	if ( !mach64glx.dmaActive ) 
	{
		return 0;
	}
	
	if ( mach64glx.skipDma ) 
	{
		return 0;
	}

	/* startTime == 0 doubles as "timer not started yet" */
	startTime = 0;
	curTime = 0;

	while ( 1 )
	{
		guiStat = SWAP( INREG( MACH64_GUI_STAT ) );

		if ( !( guiStat & ENGINE_BUSY ) ) 
		{
			break;
		}

		curTime = usec();
		if ( ( startTime == 0 ) || ( curTime < startTime /*wrap case*/ ) ) {
			/* first busy poll, or the clock wrapped: (re)start the timer */
			startTime = curTime;
		} else if ( curTime - startTime > TIMEOUT_USEC ) {
			hwMsg( 1, "waitForDmaCompletion -- Wait for GUI idle timed out\n" );
			break;
		}

		/* spin in place a bit so we aren't hammering the register */
		for ( i = 0 ; i < 10000 ; i++ ) {
			delay();
		}
	}

	hwMsg( 10, "waitForDmaCompletion, usec: %d\n", curTime - startTime );
	if ( guiStat & ENGINE_BUSY ) 
	{
		// We timed out on the wait for GUI idle, but did we complete
		// DMA normally anyhow?  If so, don't choke on this call...
		busCntl = SWAP( INREG( MACH64_BUS_CNTL ) );

		/* The test has to be !( x & flag ): "!x & flag" applies the !
		 * first, which yields 0 for any flag above bit 0 and made this
		 * branch unreachable. */
		if ( !( busCntl & BUS_MASTER_DIS ) )
		{
			/* bus mastering is still enabled, so the dma did not shut down */

			/* dump the contents of some registers */
			mach64DumpEngineState();

			/* unlock the engine - we may be able to exit gracefully */
			mach64EngineReset();

			FatalError( "waitForDmaCompletion timed out, GUI_STAT=0x%08x\n", SWAP( INREG( MACH64_GUI_STAT ) ) );
		}
		hwMsg( 1, "waitForDmaCompletion -- GUI wait timed out, DMA shut down proper\n");	
	}

	/* NOTE(review): dmaActive is only cleared when the registers were
	 * locked.  In the code visible in this file only mach64FlushRealDma
	 * locks them, so after a pseudo-dma flush the flag appears to stay
	 * set -- confirm that this is intended. */
	if (mach64RegistersLocked)
	{
		// Free up access to the space and flag the DMA as not being active.
		mach64UnlockRegisters();
		mach64glx.dmaActive = 0;
	}
		
	return curTime - startTime;
}

/*
 * mach64DmaResetBuffer
 * Point mach64glx.dma_buffer at the buffer selected by
 * mach64glx.activeDmaBuffer and mark that buffer as empty.
 */
void mach64DmaResetBuffer( void )
{
	mach64glx.dma_buffer = dmaBuffers[ mach64glx.activeDmaBuffer ];
	mach64glx.dma_buffer->bufferDwords = 0;
}



#ifdef __PPC__

// Starting is reliable now -- operation isn't.  
// As was with the person before me, I'm playing with things to see what will and won't work...
//
//	FCE (01-27-01)

/* Data cache block flush - write out the cache line containing the
   specified address and then invalidate it in the cache.

   The address is passed as the second operand with a literal 0 as the
   first: a general register constraint may pick r0, and r0 in the first
   operand slot is read as the constant 0 instead of the register's
   contents.  The "memory" clobber keeps the compiler from moving memory
   accesses across the flush. */
static __inline__ void dcbf(void *line)
{
	__asm__ __volatile__ ("dcbf 0,%0; sync" : : "r" (line) : "memory");
}

// Ensure In-order Execution of I/O (eieio)
static __inline__ void eieio(void)
{
	__asm__ __volatile__ ("eieio" : : : "memory");
}


#else

/* no-op everywhere else */
#define	eieio()

#endif

/* number of bytes covered by each full descriptor entry built in mach64FlushRealDma */
#define DMA_CHUNKSIZE		0x1000
/* added to MACH64_BM_ADDR to form the frame buffer offset stored in each descriptor */
#define APERTURE_OFFSET		0x7ff800
/* This is the offset from Aperture Base to the start of
 * Memory Mapped Register Block 1 (at offset 8MB minus 2K).
 * 
 * MM Register Block 0 appears 1K higher, at Offset 8MB 
 * minus 1K. You will find that the register offsets in
 * mach64rgs.h are BYTE offsets (not DWORD indices) that
 * are relative to the start of MM Register Block 1. This
 * base virtual address is kept as mach64glx.MMIOBase
 */


/*
 * VirtualToPhysical
 * Return the mach64glx.memoryRemapping[] entry for the 4k page of
 * mach64glx.dmaMemory that contains v.  FatalError is called when v falls
 * outside of that block (dmaSize is presumably in megabytes -- confirm).
 */
static int VirtualToPhysical( void *v )
{
	int	pageIndex = ( (char *)v - mach64glx.dmaMemory ) >> 12;

	if ( pageIndex >= 0 && pageIndex < mach64glx.dmaSize * 0x100000 / 4096 ) {
		return mach64glx.memoryRemapping[ pageIndex ];
	}

	FatalError( "VirtualToPhysical: out of range" );
	return mach64glx.memoryRemapping[ pageIndex ];
}

/*
 * mach64FlushRealDma
 * Build a descriptor table covering the current dma buffer in
 * DMA_CHUNKSIZE pieces, program the bus master registers to start the
 * transfer, and write protect the register page until the dma is waited on.
 * Does nothing when mach64glx.skipDma is set.
 *
 * Note that it appears that if you mess up a descriptor table, you need
 * to completely reboot to get the dma subsystem alive again.  Restarting
 * The X server will get the GUI registers back, but not DMA.
 */
void mach64FlushRealDma( void )
{
	hwUI32		*table_ptr;
	char		*page;
	int		tableDwords;
	int		i;
	int		pages;
	int		remainder;

	if ( mach64glx.skipDma ) {
		return;
	}

	/* number of DMA_CHUNKSIZE pieces needed for the buffer, rounded up */
	pages = ( mach64glx.dma_buffer->bufferDwords * 4 + DMA_CHUNKSIZE - 1 ) / DMA_CHUNKSIZE;


	/* generate the descriptors for the full 4k chunks */
	table_ptr = (hwUI32 *)mach64glx.descriptorMemory;
	tableDwords = 0;
	for ( i = 0 ; i < pages-1 ; i++ ) {
		page = (char *)mach64glx.dma_buffer->virtualBuffer + i * DMA_CHUNKSIZE;

		table_ptr[DMA_FRAME_BUF_OFFSET] = SWAP( MACH64_BM_ADDR + APERTURE_OFFSET );
		table_ptr[DMA_SYS_MEM_ADDR] = SWAP( VirtualToPhysical( page ) );
		table_ptr[DMA_COMMAND] = SWAP( DMA_CHUNKSIZE | 0x40000000 );
		table_ptr[DMA_RESERVED] = 0;

		tableDwords += 4;
		table_ptr += 4;
	}

	/* generate the final descriptor for any remaining commands; only this
	 * entry carries the extra 0x80000000 flag (presumably the end-of-list
	 * marker -- confirm against the register reference) */
	page = (char *)mach64glx.dma_buffer->virtualBuffer + i * DMA_CHUNKSIZE;
	remainder = mach64glx.dma_buffer->bufferDwords * 4 - i * DMA_CHUNKSIZE;
	table_ptr[DMA_FRAME_BUF_OFFSET] = SWAP( MACH64_BM_ADDR + APERTURE_OFFSET );
	table_ptr[DMA_SYS_MEM_ADDR] = SWAP( VirtualToPhysical( page ) );
	table_ptr[DMA_COMMAND] = SWAP( remainder | 0x80000000 | 0x40000000 );
	table_ptr[DMA_RESERVED] = 0;

 	tableDwords += 4;

#ifdef MESA_DEBUG
	/* print info about descriptor table entries */
	hwMsg( 25, "  table at:0x%x  table entries:%d  buffer bytes:%d\n",
		   mach64glx.descriptorPhysical, tableDwords / 4, mach64glx.dma_buffer->bufferDwords * 4 );

	table_ptr = (hwUI32 *)mach64glx.descriptorMemory;
	for ( i = 0 ; i < tableDwords / 4 ; i++ ) {
		/* the address is an integer table entry, so print it with %x, not %p */
		hwMsg( 25, "    entry: %x addr: 0x%x cmd: 0x%x\n", i, SWAP(table_ptr[4*i+1]), SWAP(table_ptr[4*i+2]) );

		/* dump the contents of the buffers */
		if ( hwGetLogLevel() >= 255 ) {
			/* the table holds swapped values, so swap the command word
			 * back before pulling the byte count out of it */
			int	entries = (SWAP(table_ptr[4*i+2]) & 0xffff) / 4, j;

			for ( j = 0 ; j < entries ; j++ ) {
				hwMsg( 255, "        [0x%03x]: 0x%08x\n", j, SWAP(mach64glx.dma_buffer->virtualBuffer[1024*i+j]) );
			}
		}
	}
#endif

	/* make sure any write combining data is flushed */
	FlushWriteCombining();

	/* actually start the dma transfer */
	MACH64_WAITFREE();

	/* enable bus mastering by clearing BUS_MASTER_DIS */
	//eieio();
	OUTREG( MACH64_BUS_CNTL, SWAP( SWAP( INREG( MACH64_BUS_CNTL ) ) & ~BUS_MASTER_DIS ) );

	/* tell the chip where the descriptor table lives */
	//eieio();
	OUTREG( MACH64_BM_GUI_TABLE_CMD, SWAP( mach64glx.descriptorPhysical | CIRCULAR_BUF_SIZE_16KB ) );

	//eieio();
	OUTREG( MACH64_SRC_CNTL,
		SWAP( SWAP( INREG( MACH64_SRC_CNTL ) ) |
		      SRC_BM_ENABLE | SRC_BM_SYNC | SRC_BM_OP_SYSTEM_TO_REG ) );

	/*
	 * To start the DMA transfer, we need to initiate a GUI operation.  We can
	 * write any value to the register, as it is only used to start the engine.
	 */
	//eieio();
	OUTREG( MACH64_DST_HEIGHT_WIDTH, 0 );
	//eieio();

	// Lock out anybody else from doing register accesses...
	mach64LockRegisters();	
}

/*
 * mach64ServerDmaFlush
 * Send all pending commands off to the hardware.
 * If we are running async, the hardware will be drawing
 * while we return to do other things.
 *
 * wait: when non-zero, do not return until the dma has completed.
 *
 * Direct rendering clients hand the request to
 * mach64DirectClientDMAFlush instead.  In the server, the register state
 * saved in mach64glx.registers[] is appended to the buffer before it is
 * flushed, and the other dma buffer becomes the active one afterwards.
 */
void mach64ServerDmaFlush( int wait )
{
	int		start, end;
	DMALOCALS;
	int		old;

	/* if we are a direct rendering client, message the server */
	if ( !__glx_is_server ) {
		mach64DirectClientDMAFlush( wait );
		return;
	}

	/* if the buffer is empty, just change in place */
	if ( !mach64glx.dma_buffer->bufferDwords ) {
		if ( wait ) {
			mach64WaitForDmaCompletion();
		}
		mach64DmaResetBuffer();
		return;
	}

	mach64glx.c_dmaFlush++;

	/* wait for the last buffer to complete; a zero wait time is recorded
	 * as the hardware having gone idle */
	if ( !mach64WaitForDmaCompletion() ) {
		mach64glx.hardwareWentIdle = 1;
	} else {
		mach64glx.hardwareWentIdle = 0;
	}

	/* Add the commands at the end of the buffer to go back to
	 * drawing on the front buffer the way the X server expects.
	 */

	/* allow these to go into the overflow safety zone */
	old = mach64glx.dma_buffer->overflowBufferDwords;
	mach64glx.dma_buffer->overflowBufferDwords = mach64glx.dma_buffer->maxBufferDwords;

	DMAGETPTR( 60 );

	DMAOUTREG( MACH64_DST_OFF_PITCH, mach64glx.registers[MACH64_DST_OFF_PITCH>>2] );
	DMAOUTREG( MACH64_SRC_OFF_PITCH, mach64glx.registers[MACH64_SRC_OFF_PITCH>>2] );
	DMAOUTREG( MACH64_DP_SRC, mach64glx.registers[MACH64_DP_SRC>>2] );
	DMAOUTREG( MACH64_DP_MIX, mach64glx.registers[MACH64_DP_MIX>>2] );
	DMAOUTREG( MACH64_DP_FRGD_CLR, mach64glx.registers[MACH64_DP_FRGD_CLR>>2] );
	DMAOUTREG( MACH64_DP_WRITE_MASK, mach64glx.registers[MACH64_DP_WRITE_MASK>>2] );
	DMAOUTREG( MACH64_DP_PIX_WIDTH, mach64glx.registers[MACH64_DP_PIX_WIDTH>>2] );
	DMAOUTREG( MACH64_Z_CNTL, mach64glx.registers[MACH64_Z_CNTL>>2] );
	DMAOUTREG( MACH64_CLR_CMP_CNTL, mach64glx.registers[MACH64_CLR_CMP_CNTL>>2] );
	/* restore ALPHA_TST_CNTL from its own saved slot, not from CLR_CMP_CNTL's */
	DMAOUTREG( MACH64_ALPHA_TST_CNTL, mach64glx.registers[MACH64_ALPHA_TST_CNTL>>2] );
	DMAOUTREG( MACH64_GUI_TRAJ_CNTL, mach64glx.registers[MACH64_GUI_TRAJ_CNTL>>2] );
	DMAOUTREG( MACH64_SCALE_3D_CNTL, mach64glx.registers[MACH64_SCALE_3D_CNTL>>2] );
	DMAOUTREG( MACH64_SETUP_CNTL, mach64glx.registers[MACH64_SETUP_CNTL>>2] );
	/* can't use the composite registers, because they are write only and the save was wrong */
	DMAOUTREG( MACH64_SC_LEFT, mach64glx.registers[MACH64_SC_LEFT>>2] );
	DMAOUTREG( MACH64_SC_RIGHT, mach64glx.registers[MACH64_SC_RIGHT>>2] );
	DMAOUTREG( MACH64_SC_TOP, mach64glx.registers[MACH64_SC_TOP>>2] );
	DMAOUTREG( MACH64_SC_BOTTOM, mach64glx.registers[MACH64_SC_BOTTOM>>2] );
	/* these should terminate the dma, so they should be last */
	DMAOUTREG( MACH64_BUS_CNTL, mach64glx.registers[MACH64_BUS_CNTL>>2] );
	DMAOUTREG( MACH64_SRC_CNTL, mach64glx.registers[MACH64_SRC_CNTL>>2] );

	DMAADVANCE();

	/* put the normal overflow limit back */
	mach64glx.dma_buffer->overflowBufferDwords = old;

	/* collect timing information if we are going synchronously */
	if ( mach64glx.dmaDriver < 2 ) {
		start = usec();
	} else {
		start = end = 0;
	}

	/* we will have to wait before doing any software rendering */
	mach64glx.dmaActive = 1;

	/* dmaDriver 0 hand feeds the registers, anything else uses real dma */
	if ( mach64glx.dmaDriver == 0 ) {
		mach64FlushPseudoDma();
	} else {
		mach64FlushRealDma();
	}

	if ( ( mach64glx.dmaDriver == 1 ) || wait ) {
		/* wait until the dma completes */
		mach64WaitForDmaCompletion();
	}

	if ( mach64glx.dmaDriver < 2 ) {
		end = usec();
	}

	hwMsg( 9, "flushmode %i, buffer %i: dwords:%i  usec:%i\n",
		   mach64glx.dmaDriver,  mach64glx.activeDmaBuffer,
		   mach64glx.dma_buffer->bufferDwords, end - start );

	/* swap to using the other buffer */
	mach64glx.activeDmaBuffer ^= 1;

	mach64DmaResetBuffer();
}

/*
 * mach64DmaFlush
 * Flush the pending commands without asking for the dma to complete.
 */
void mach64DmaFlush( void )
{
	const int	dontWait = 0;

	mach64ServerDmaFlush( dontWait );
}

/*
 * mach64DmaFinish
 * Flush the pending commands and wait for the dma to complete.
 */
void mach64DmaFinish( void )
{
	/* Count the call: the performance block display shows it, and the
	 * front buffer auto-swap uses it to learn that some software
	 * rendering was done.
	 */
	mach64glx.c_drawWaits++;

	/* This gets called for every software rendered scanline, so only
	 * go through the flush when dma is running or commands are queued.
	 */
	if ( mach64glx.dmaActive || mach64glx.dma_buffer->bufferDwords ) {
		mach64ServerDmaFlush( 1 );
	}
}


/*
 * mach64DmaOverflow
 * Called when DMAGETPTR has run into the end of the buffer.
 * newDwords is the size of the request that did not fit.
 */
void mach64DmaOverflow( int newDwords )
{
	hwMsg( 9, "mach64DmaOverflow(%i)\n", newDwords );

	/* push out everything queued so far, leaving us with
	   another empty buffer */
	mach64DmaFlush();

	/* An overflow can strike anywhere, so the normal update
	 * mechanisms are not enough; refresh the state right here.
	 */
	if ( mach64Ctx ) {
		mach64DDUpdateState( mach64Ctx->gl_ctx );
	}

	mach64glx.c_overflows++;

	/* a request that fits within the overflow limit is fine */
	if ( newDwords <= mach64glx.dma_buffer->overflowBufferDwords ) {
		return;
	}

	FatalError("mach64DmaOverflow (%i) > overflowBufferDwords(%i)",
	newDwords, mach64glx.dma_buffer->overflowBufferDwords  );
}


/*
 * mach64PseudoDmaWaitIdle
 * Helper for mach64FlushPseudoDma: poll GUI_STAT until ENGINE_BUSY clears.
 * After more than one second of waiting the engine state is dumped, the
 * engine is reset and FatalError is raised.
 *
 * when:  "before" or "after", used only in the error message
 * dword: current position in the buffer, for the error message
 * count: total dwords in the buffer, for the error message
 */
static void mach64PseudoDmaWaitIdle( const char *when, int dword, int count )
{
	int		startTime, t;

	startTime = usec();
	while ( SWAP( INREG( MACH64_GUI_STAT ) ) & ENGINE_BUSY ) 
	{
		t = usec();
		if ( t - startTime > 1000000 ) 
		{
			mach64DumpEngineState();
			mach64EngineReset();
			FatalError( "mach64FlushPseudoDma timed out %s register write at dword %i of %i", when, dword, count );
		}
	}
}

/*
 * mach64FlushPseudoDma
 * Hand feed a dma buffer to the card instead of using DMA
 * This isn't intended to be very efficient.
 *
 * The buffer is a series of packets: a header dword whose low 16 bits
 * select the first register and whose high 16 bits hold the number of
 * additional registers, followed by one value dword per register written.
 */
static void mach64FlushPseudoDma( void )
{
	hwUI32		*src;
	int		i;
	int		count;
	int		reg;
	int		sequence, wait;

	count = mach64glx.dma_buffer->bufferDwords;

	hwMsg( 20, "primary pseudoDma: %i dwords\n", count );

	mach64glx.hardwareWentIdle = 1;

	if ( mach64glx.skipDma ) 
	{
		return;
	}

	/* hand feed each register to the card */
	src = mach64glx.dma_buffer->virtualBuffer;
	for ( i = 0 ; i < count ;  )
	{
		reg = SWAP(src[i]);		/* back to bigendian if PPC */
		i++;

		sequence = reg >> 16;
		reg &= 0xffff;

		/* remember whether this is a multi-reg write; sequence itself
		 * is counted down by the write loop below */
		wait = sequence;
		
		/* wait for the engine whenever the index lands on a multiple of 16 */
		if ( !(i & 15) ) 
		{
			mach64PseudoDmaWaitIdle( "before", i, count );
		}

		/* write sequence + 1 consecutive registers (multi reg writes) */
		reg = MMSELECT( reg );
		while ( sequence-- >= 0 ) 
		{
			hwMsg( 255, "    reg[%d] = 0x%03x val = 0x%08x\n", i, ADRINDEX( reg ), src[i] );
			OUTREG( reg, src[i] );
			reg += 4;
			i++;
		}

		/* wait for the engine again after every multi-reg write */
		if ( wait ) 
		{
			mach64PseudoDmaWaitIdle( "after", i, count );
		}
	}
}



/*
 * Local Variables:
 * mode: c
 * tab-width: 8
 * c-basic-offset: 8
 * End:
 */
