Saxum/extern/bullet/Extras/CDTestFramework/Opcode/Ice/IceUtils.h

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 *	Contains misc. useful macros & defines.
 *	\file		IceUtils.h
 *	\author		Pierre Terdiman (personal code + collected from various sources)
 *	\date		April, 4, 2000
 */
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Include Guard
#ifndef ICEUTILS_H
#define ICEUTILS_H

	#define START_RUNONCE	{ static bool __RunOnce__ = false;	if(!__RunOnce__){
	#define END_RUNONCE		__RunOnce__ = true;}}

	//! Reverse all the bits in a 32 bit word (from Steve Baker's Cute Code Collection)
	//! (each line can be done in any order.
	inline_ void	ReverseBits(udword& n)
	{
		n = ((n >>  1) & 0x55555555) | ((n <<  1) & 0xaaaaaaaa);
		n = ((n >>  2) & 0x33333333) | ((n <<  2) & 0xcccccccc);
		n = ((n >>  4) & 0x0f0f0f0f) | ((n <<  4) & 0xf0f0f0f0);
		n = ((n >>  8) & 0x00ff00ff) | ((n <<  8) & 0xff00ff00);
		n = ((n >> 16) & 0x0000ffff) | ((n << 16) & 0xffff0000);
		// Etc for larger integers (64 bits in Java)
		// NOTE: the >> operation must be unsigned! (>>> in java)
	}

	//! Count the number of '1' bits in a 32 bit word (from Steve Baker's Cute Code Collection)
	inline_ udword	CountBits(udword n)
	{
		// This relies of the fact that the count of n bits can NOT overflow
		// an n bit integer. EG: 1 bit count takes a 1 bit integer, 2 bit counts
		// 2 bit integer, 3 bit count requires only a 2 bit integer.
		// So we add all bit pairs, then each nible, then each byte etc...
		n = (n & 0x55555555) + ((n & 0xaaaaaaaa) >> 1);
		n = (n & 0x33333333) + ((n & 0xcccccccc) >> 2);
		n = (n & 0x0f0f0f0f) + ((n & 0xf0f0f0f0) >> 4);
		n = (n & 0x00ff00ff) + ((n & 0xff00ff00) >> 8);
		n = (n & 0x0000ffff) + ((n & 0xffff0000) >> 16);
		// Etc for larger integers (64 bits in Java)
		// NOTE: the >> operation must be unsigned! (>>> in java)
		return n;
	}

	//! Even faster?
	inline_ udword	CountBits2(udword bits)
	{
		bits = bits - ((bits >> 1) & 0x55555555);
		bits = ((bits >> 2) & 0x33333333) + (bits & 0x33333333);
		bits = ((bits >> 4) + bits) & 0x0F0F0F0F;
		return (bits * 0x01010101) >> 24;
	}

	// "Population Count (Ones Count)
	// The population count of a binary integer value x is the number of one bits in the value. Although many machines have
	// single instructions for this, the single instructions are usually microcoded loops that test a bit per cycle; a log-time
	// algorithm coded in C is often faster. The following code uses a variable-precision SWAR algorithm to perform a tree
	// reduction adding the bits in a 32-bit value:"
	inline_ udword	ones32(udword x)
	{
		/* 32-bit recursive reduction using SWAR...
		but first step is mapping 2-bit values
		into sum of 2 1-bit values in sneaky way
		*/
		x -= ((x >> 1) & 0x55555555);
		x = (((x >> 2) & 0x33333333) + (x & 0x33333333));
		x = (((x >> 4) + x) & 0x0f0f0f0f);
		x += (x >> 8);
		x += (x >> 16);
		return (x & 0x0000003f);
		// "It is worthwhile noting that the SWAR population count algorithm given above can be improved upon for the case of
		// counting the population of multi-word bit sets. How? The last few steps in the reduction are using only a portion
		// of the SWAR width to produce their results; thus, it would be possible to combine these steps across multiple words
		// being reduced. One additional note: the AMD Athlon optimization guidelines suggest a very similar algorithm that
		// replaces the last three lines with return((x * 0x01010101) >> 24);. For the Athlon (which has a very fast integer
		// multiply), I would have expected AMD's code to be faster... but it is actually 6% slower according to my benchmarks
		// using a 1.2GHz Athlon (a Thunderbird). Why? Well, it so happens that GCC doesn't use a multiply instruction - it
		// writes out the equivalent shift and add sequence!"
	}

	// "Trailing Zero Count
	// Given the Least Significant 1 Bit and Population Count (Ones Count) algorithms, it is trivial to combine them to
	// construct a trailing zero count (as pointed-out by Joe Bowbeer):"
	inline_ udword	tzc(sdword x)
	{
		return(ones32((x & -x) - 1));
	}

	//! Spread out bits.	EG	00001111  ->   0101010101
	//! 						00001010  ->   0100010000
	//! This is used to interleave two integers to produce a `Morton Key'
	//! used in Space Filling Curves (See DrDobbs Journal, July 1999)
	//! Order is important.
	inline_ void	SpreadBits(udword& n)
	{
		n = ( n & 0x0000ffff) | (( n & 0xffff0000) << 16);
		n = ( n & 0x000000ff) | (( n & 0x0000ff00) <<  8);
		n = ( n & 0x000f000f) | (( n & 0x00f000f0) <<  4);
		n = ( n & 0x03030303) | (( n & 0x0c0c0c0c) <<  2);
		n = ( n & 0x11111111) | (( n & 0x22222222) <<  1);
	}

	// "Next Largest Power of 2
	// Given a binary integer value x, the next largest power of 2 can be computed by a SWAR algorithm
	// that recursively "folds" the upper bits into the lower bits. This process yields a bit vector with
	// the same most significant 1 as x, but all 1's below it. Adding 1 to that value yields the next
	// largest power of 2. For a 32-bit value:"
	inline_ udword	NextPowerOfTwo(udword x)
	{
		x |= (x >> 1);
		x |= (x >> 2);
		x |= (x >> 4);
		x |= (x >> 8);
		x |= (x >> 16);
		return x+1;
	}

	//! Test to see if a number is an exact power of two (from Steve Baker's Cute Code Collection)
	inline_ bool	IsPowerOfTwo(udword n)				{ return ((n&(n-1))==0);					}

	//! Zero the least significant '1' bit in a word. (from Steve Baker's Cute Code Collection)
	inline_ void	ZeroLeastSetBit(udword& n)			{ n&=(n-1);									}

	//! Set the least significant N bits in a word. (from Steve Baker's Cute Code Collection)
	inline_ void	SetLeastNBits(udword& x, udword n)	{ x|=~(~0<<n);								}

	//! Classic XOR swap (from Steve Baker's Cute Code Collection)
	//! x ^= y;		/* x' = (x^y) */
	//! y ^= x;		/* y' = (y^(x^y)) = x */
	//! x ^= y;		/* x' = (x^y)^x = y */
	inline_ void	Swap(udword& x, udword& y)			{ x ^= y; y ^= x; x ^= y;					}

	//! Little/Big endian (from Steve Baker's Cute Code Collection)
	//!
	//! Extra comments by Kenny Hoff:
	//! Determines the byte-ordering of the current machine (little or big endian)
	//! by setting an integer value to 1 (so least significant bit is now 1); take
	//! the address of the int and cast to a byte pointer (treat integer as an
	//! array of four bytes); check the value of the first byte (must be 0 or 1).
	//! If the value is 1, then the first byte least significant byte and this
	//! implies LITTLE endian. If the value is 0, the first byte is the most
	//! significant byte, BIG endian. Examples:
	//!      integer 1 on BIG endian: 00000000 00000000 00000000 00000001
	//!   integer 1 on LITTLE endian: 00000001 00000000 00000000 00000000
	//!---------------------------------------------------------------------------
	//! int IsLittleEndian()	{ int x=1;	return ( ((char*)(&x))[0] );	}
	inline_ char	LittleEndian()						{ int i = 1; return *((char*)&i);			}

	//!< Alternative abs function
	inline_ udword	abs_(sdword x)						{ sdword y= x >> 31;	return (x^y)-y;		}

	// "Integer Minimum or Maximum
	// Given 2's complement integer values x and y, the minimum can be computed without any branches as
	// x+(((y-x)>>(WORDBITS-1))&(y-x)).
	// Logically, this works because the shift by (WORDBITS-1) replicates the sign bit to create a mask
	// -- be aware, however, that the C language does not require that shifts are signed even if their
	// operands are signed, so there is a potential portability problem. Additionally, one might think
	// that a shift by any number greater than or equal to WORDBITS would have the same effect, but many
	// instruction sets have shifts that behave strangely when such shift distances are specified.
	// Of course, maximum can be computed using the same trick:
	// x-(((x-y)>>(WORDBITS-1))&(x-y))."

	//!< Alternative min function
	inline_ sdword	min_(sdword a, sdword b)			{ sdword delta = b-a;	return a + (delta&(delta>>31));	}
	//!< Alternative max function
	inline_ sdword	max_(sdword a, sdword b)			{ sdword delta = a-b;	return a - (delta&(delta>>31));	}

	// "Integer Selection
	// A branchless, lookup-free, alternative to code like if (a<b) x=c; else x=d; is ((((a-b) >> (WORDBITS-1)) & (c^d)) ^ d).
	// This code assumes that the shift is signed, which, of course, C does not promise."
	inline_ sdword IntegerSelection(sdword a, sdword b, sdword c, sdword d)
	{
		return ((((a-b)>>31) & (c^d)) ^ d);
	}

	// Determine if one of the bytes in a 4 byte word is zero
	inline_	BOOL	HasNullByte(udword x)				{ return ((x + 0xfefefeff) & (~x) & 0x80808080);		}

	// To find the smallest 1 bit in a word  EG: ~~~~~~10---0    =>    0----010---0
	inline_	udword	LowestOneBit(udword w)				{ return ((w) & (~(w)+1));								}
//	inline_	udword	LowestOneBit_(udword w)				{ return ((w) & (-(w)));								}

	// "Most Significant 1 Bit
	// Given a binary integer value x, the most significant 1 bit (highest numbered element of a bit set)
	// can be computed using a SWAR algorithm that recursively "folds" the upper bits into the lower bits.
	// This process yields a bit vector with the same most significant 1 as x, but all 1's below it.
	 // Bitwise AND of the original value with the complement of the "folded" value shifted down by one
	// yields the most significant bit. For a 32-bit value:"
	inline_ udword	msb32(udword x)
	{
		x |= (x >> 1);
		x |= (x >> 2);
		x |= (x >> 4);
		x |= (x >> 8);
		x |= (x >> 16);
		return (x & ~(x >> 1));
	}

	// "Gray Code Conversion
	// A Gray code is any binary coding sequence in which only a single bit position changes as we move from one value to the next.
	// There are many such codes, but the traditional one is computed such that the Kth Gray code is K^(K>>1).
	//
	// The well-known algorithm for conversion from Gray to binary is a linear sequence of XORs that makes it seem each bit must be
	// dealt with separately. Fortunately, that is equivalent to a parallel prefix XOR that can be computed using SWAR techniques
	// in log time. For 32-bit Gray code values produced as described above, the conversion from Gray code back to unsigned binary is:"
	inline_ udword g2b(udword gray)
	{
		gray ^= (gray >> 16);
		gray ^= (gray >> 8);
		gray ^= (gray >> 4);
		gray ^= (gray >> 2);
		gray ^= (gray >> 1);
		return gray;
	}

	/*
	"Just call it repeatedly with various input values and always with the same variable as "memory".
	The sharpness determines the degree of filtering, where 0 completely filters out the input, and 1
	does no filtering at all.

	I seem to recall from college that this is called an IIR (Infinite Impulse Response) filter. As opposed
	to the more typical FIR (Finite Impulse Response).

	Also, I'd say that you can make more intelligent and interesting filters than this, for example filters
	that remove wrong responses from the mouse because it's being moved too fast. You'd want such a filter
	to be applied before this one, of course."

	(JCAB on Flipcode)
	*/
	inline_ float	FeedbackFilter(float val, float& memory, float sharpness)
	{
		ASSERT(sharpness>=0.0f && sharpness<=1.0f && "Invalid sharpness value in feedback filter");
				if(sharpness<0.0f)	sharpness = 0.0f;
		else	if(sharpness>1.0f)	sharpness = 1.0f;
		return memory = val * sharpness + memory * (1.0f - sharpness);
	}

	//! "If you can guarantee that your input domain (i.e. value of x) is slightly
	//! limited (abs(x) must be < ((1<<31u)-32767)), then you can use the
	//! following code to clamp the resulting value into [-32768,+32767] range:"
	inline_ int	ClampToInt16(int x)
	{
//		ASSERT(abs(x) < (int)((1<<31u)-32767));

		int delta = 32767 - x;
		x += (delta>>31) & delta;
		delta = x + 32768;
		x -= (delta>>31) & delta;
		return x;
	}

	// Generic functions
	template<class Type> inline_ void TSwap(Type& a, Type& b)								{ const Type c = a; a = b; b = c;			}
	template<class Type> inline_ Type TClamp(const Type& x, const Type& lo, const Type& hi)	{ return ((x<lo) ? lo : (x>hi) ? hi : x);	}

	template<class Type> inline_ void TSort(Type& a, Type& b)
	{
		if(a>b)	TSwap(a, b);
	}

	template<class Type> inline_ void TSort(Type& a, Type& b, Type& c)
	{
		if(a>b)	TSwap(a, b);
		if(b>c)	TSwap(b, c);
		if(a>b)	TSwap(a, b);
		if(b>c)	TSwap(b, c);
	}

	// Prevent nasty user-manipulations (strategy borrowed from Charles Bloom)
//	#define PREVENT_COPY(curclass)	void operator = (const curclass& object)	{	ASSERT(!"Bad use of operator =");	}
	// ... actually this is better !
	#define PREVENT_COPY(cur_class)	private: cur_class(const cur_class& object);	cur_class& operator=(const cur_class& object);

	//! TO BE DOCUMENTED
	#define OFFSET_OF(Class, Member)	(size_t)&(((Class*)0)->Member)

	//! TO BE DOCUMENTED
	#if !defined(_XBOX)
		// Already defined on Xbox.
		#define ARRAYSIZE(p)			(sizeof(p)/sizeof(p[0]))
	#endif

	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	/**
	 *	Returns the alignment of the input address.
	 *	\fn			Alignment()
	 *	\param		address	[in] address to check
	 *	\return		the best alignment (e.g. 1 for odd addresses, etc)
	 */
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	FUNCTION ICECORE_API udword Alignment(udword address);

	#define IS_ALIGNED_2(x)		((x&1)==0)
	#define IS_ALIGNED_4(x)		((x&3)==0)
	#define IS_ALIGNED_8(x)		((x&7)==0)

	// Updates a pointer with "stride" bytes
	inline_	void UpdatePtr(void*& ptr, udword stride)	{ ptr = ((ubyte*)ptr) + stride;			}

	// From Jon Watte IIRC
	inline_ void _prefetch(void const* ptr)				{ (void)*(char const volatile *)ptr;	}

	// Compute implicit coords from an index:
	// The idea is to get back 2D coords from a 1D index.
	// For example:
	//
	// 0		1		2	...	nbu-1
	// nbu		nbu+1	i	...
	//
	// We have i, we're looking for the equivalent (u=2, v=1) location.
	//		i = u + v*nbu
	// <=>	i/nbu = u/nbu + v
	// Since 0 <= u < nbu, u/nbu = 0 (integer)
	// Hence: v = i/nbu
	// Then we simply put it back in the original equation to compute u = i - v*nbu
	inline_ void Compute2DCoords(udword& u, udword& v, udword i, udword nbu)
	{
		v = i / nbu;
		u = i - (v * nbu);
	}

	// In 3D:	i = u + v*nbu + w*nbu*nbv
	// <=>		i/(nbu*nbv) = u/(nbu*nbv) + v/nbv + w
	// u/(nbu*nbv) is null since u/nbu was null already.
	// v/nbv is null as well for the same reason.
	// Hence w = i/(nbu*nbv)
	// Then we're left with a 2D problem: i' = i - w*nbu*nbv = u + v*nbu
	inline_ void Compute3DCoords(udword& u, udword& v, udword& w, udword i, udword nbu, udword nbu_nbv)
	{
		w = i / (nbu_nbv);
		Compute2DCoords(u, v, i - (w * nbu_nbv), nbu);
	}

	// Calling fsincos instead of fsin+fcos. Twice faster.
	inline_	void FSinCos(float& c, float& s, float f)
	{
		float LocalCos, LocalSin;
		float Local = f;
#ifdef WIN32
		_asm	fld		Local
		_asm	fsincos
		_asm	fstp	LocalCos
		_asm	fstp	LocalSin
#elif LINUX
		asm("fld	Local\n\t"
			"fsincos\n\t"
			"fstp	LocalCos\n\t"
			"fstp	LocalSin\n\t"
		);
#endif
		c = LocalCos;
		s = LocalSin;
	}

	// Modulo3 macros. See http://www.codercorner.com/Modulo3.htm
	#define GET_NEXT_INDICES(i, j, k)	\
		k = 0x01000201;					\
		k>>=(i<<3);						\
		j = k & 0xff;					\
		k>>=8;							\
		k&=0xff;

	#define GET_NEXT_INDICES2(i, j, k)	\
		j = ( 9 >> (i<<1)) & 3;			\
		k = (18 >> (i<<1)) & 3;

	// 0=>1, 1=>2, 2=>0
	inline_	udword Modulo3(udword i)
	{
		ASSERT(i==0 || i==1 || i==2);
		return (9 >> (i << 1)) & 3;
	}

#endif // ICEUTILS_H