summary refs log tree commit diff stats
path: root/lib/atomic.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/atomic.h')
-rw-r--r--lib/atomic.h2716
1 files changed, 2716 insertions, 0 deletions
diff --git a/lib/atomic.h b/lib/atomic.h
new file mode 100644
index 000000000..afab4843f
--- /dev/null
+++ b/lib/atomic.h
@@ -0,0 +1,2716 @@
+/* Atomic operations for Nimrod */
+
+#if defined(_MSCVER)
+__declspec(naked) int __fastcall Xadd (volatile int* pNum, int val)
+{
+    __asm
+    {
+        lock xadd dword ptr [ECX], EDX
+        mov EAX, EDX
+        ret
+    }
+}
+
+
+
+#endif
+
+
+#define ATOMIC_ASM(type,op)     \
+    __asm __volatile ("lock; " op : "=m" (*(type *)p) : "ir" (v), "0" (*(type *)p))
+
+#define ATOMIC_ASM_NOLOCK(type,op)     \
+    __asm __volatile (op : "=m" (*(type *)p) : "ir" (v), "0" (*(type *)p))
+
+static __inline void
+atomic_add_int(void *p, u_int v)
+{
+        ATOMIC_ASM(int, "addl %1,%0");
+}
+
+static __inline void
+atomic_add_int_nolock(void *p, u_int v)
+{
+        ATOMIC_ASM_NOLOCK(int, "addl %1,%0");
+}
+
+
+
+/*
+Atomic.h
+
+Joshua Scholar
+May 26, 2003
+
+This header contains:
+
+a multiprocessor nonblocking FIFO,  
+
+a multiprocessor nonblocking LIFO
+
+multiprocessor nonblocking reference counting (including volatile pointers 
+that can be safely shared between processors)
+
+nonblocking memory allocation routines
+
+template types that encapsulate variables meant to be shared between 
+processors - all changes to these variables are atomic and globally visible
+
+All kinds of atomic operations that are useful in a multiprocessor context.
+
+The philosophy behind this code is that I created templates that encapsulate
+atomic access so that while the templates themselves may not be the easiest
+code to read, code that uses these templates can be simple, abstract and
+reliable.
+
+I also created regular C style functions, overloaded by type for some of
+the more basic operations.  If you have regular variables or memory 
+locations that you want to use in a globally visible way you have two 
+choices.
+
+If the operation you want is one of the basic building blocks you can
+call one of the overloaded functions like InterlockedSetIfEqual().
+
+Otherwise it's perfectly safe to cast a pointer to your data to be a pointer
+to one of the atomic types so that you can use their methods.  For instance:
+if (((AtomicInt *)foo)->CompareSomeBitsAndExchangeSomeOtherBits(exchange, 
+        bitsToExchange,
+        comperand, 
+        bitsToCompare))
+    ...
+or even
+
+//atomically allocate n bytes out of the pool
+ data = ((*(AtomicPtr<char> *)curPool)+= n) - n;
+
+
+
+State of code:
+
+Unlike other libraries of similar routines that I wrote in the past, this
+has not been thoroughly tested. In fact I don't remember how much of it has
+been tested at this point.
+
+It would take an 8 way machine for me to really pound on the routines.
+
+
+
+Overview
+
+Some basic types are:
+typedef Atomic<int> AtomicInt;
+typedef Atomic<unsigned int> AtomicUInt;
+typedef Atomic<__int64> AtomicInt64;
+typedef Atomic<unsigned __int64> AtomicUInt64;
+
+Fancier types include
+template <typename T> struct AtomicPtr;
+This is a pointer that has the same semantics as the above integer types
+
+template <typename T>struct AtomicPtrWithCount;
+
+AtomicPtrWithCount<T> is a very important type.  It has 32 bits of pointer
+and 32 bits of counter.  There are a number of algorithms that are possible
+when a pointer and a counter can be changed atomically together including a
+lock free pushdown stack and reference counted garbage collection where
+multiple processors can share pointers as well as sharing the data that's
+pointed at.
+
+template <typename T> struct AtomicPtrWithMark;
+
+This is similar to AtomicPtrWithCount<T> but the extra 32 bits are accessed
+as individual bits instead of being accessed as a counter.  It's not as
+important.  I was playing with algorithms before I realized that the
+important ones I was afraid of using had been published independently of my
+former employer.
+
+
+
+The atomic number types act as integer variables, but all changes to these
+variable happen through interlocked atomic instructions.
+All changes are therefor "globally visible" in Intel's parlance.
+
+Note that incrementing (or decrementing or adding to) one of these uses
+InterlockedExchangeAdd, which for 64 bit numbers ends up relying on "lock
+CMPXCHG8B"
+
+There's an Exchange method.
+
+There are also special methods that use compare exchange in some forms that
+I've found useful:
+
+T CompareExchange(T exchange, T comperand)
+bool SetIfEqual(T exchange, T comperand)
+
+and fancier ones I found some uses for
+
+ inline bool SetIfSomeBitsAreEqual(T exchange, T comperand, T bitsToCompare)
+ inline bool SetSomeBitsIfThoseBitsAreEqual(T exchange, 
+                                             T comperand, 
+                                             T bitsToCompare)
+ inline bool SetSomeBitsIfSomeOtherBitsAreEqual(T exchange,
+              T bitsToExchange,
+              T comperand,
+              T bitsToCompare
+              )
+
+ inline T CompareSomeBitsAndExchange(T exchange, 
+                                     T comperand, 
+                                     T bitsToCompare)
+ inline T CompareSomeBitsAndExchangeThoseBits(T exchange, 
+                                             T comperand, 
+                                             T bitsToCompare)
+ inline T CompareSomeBitsAndExchangeSomeOtherBits(T exchange,
+              T bitsToExchange,
+              T comperand,
+              T bitsToCompare
+              )
+
+There are also atomic bit test, bit test and set etc. methods:
+
+ inline bool BTS(int bit)
+ inline bool BTC(int bit)
+ inline bool BTR(int bit)
+ inline bool BT(int bit)
+
+ALGORITHMS and their classes:
+
+The important ones are:
+
+struct Counted
+Use this as the base type for any object you want to be reference counted
+
+template <typename T> class CountedPtr;
+Safely acts as pointer to a reference counted type.  This pointer can not be
+shared between threads safely.
+
+template <typename T> class AtomicCountedPtr;
+Like CountedPtr<T> but this pointer CAN be shared between threads/processors
+safely.
+
+template <typename T> class MPQueue;
+Multiprocessor queue.  This is the nonblocking shared FIFO.
+
+Note, for the sake of convenience there is a Fifo that has the same
+semantics but can only be used single threaded:
+template <typename T> class Queue ;
+
+class MPCountStack;
+This is the multiprocessor nonblocking LIFO stack.  Note that what gets
+pushed on the stack are pointers to MPStackElement. Your data must be
+objects derived from MPStackElements.  Note that it is not legal to push a
+NULL onto the stack - NULL is used to signal an empty stack.
+
+Note that for the sake of convienience there is, once again, a single
+threaded version of the stack:
+class SimpleStack.
+
+There are also classes for allocators that use the MPCountStack as a
+freelist.
+template <typename T, typename BLOCK_ALLOCATOR>
+struct MPCountFreeListAllocator;
+
+This template is recursive in the sense that each allocator gets new blocks
+from a shared allocatorn passed in the constructor (of type
+BLOCK_ALLOCATOR).  You can build a tree of allocators this way.  The root of
+the tree should be of type SimpleAllocator<T> which just calls new and
+delete.
+
+Once again, for the sake of simplicity, there is a single threaded version
+of the block allocator called
+template <typename T, typename BLOCK_ALLOCATOR> struct SimpleBlockAllocator
+
+*/
+#ifndef ATOMIC_H
+#define ATOMIC_H
+#include <windows.h>
+#include <assert.h>
+#include <new>
+using namespace std;
+/* 
+    windows defines the following interlocked routines
+    we need to define the equivalent for volatiles (which 
+    they should have been in the first place, and the following 
+    types voltatile long, volatile int, volatile unsigned long, 
+    volatile unsigned int,  volatile T*, volatile __int64, 
+    volatile unsigned __int64.
+
+    Note: I use the platform SDK which has different header files 
+    for interlocked instructions than the Windows includes in Visual 
+    C
+
+    If you have the platform SDK and the code doesn't compile then
+    you need to make sure that 
+    "C:\Program Files\Microsoft Platform SDK\include"
+    is the first directory listed under menus "Tools" -> menu item
+    "Options" -> tab "Directories" (of course if you installed the 
+    platform SDK in a different directory than 
+    "C:\Program Files\Microsoft Platform SDK" then you should use 
+    YOUR path.
+
+    If you don't have the plaform SDK then InterlockedCompareExchange 
+    is defined for void * instead of being defined for longs...  and 
+    there is no InterlockedCompareExchangePointer
+
+    The whole point of Microsoft having different headers was an update
+    to support 64 bit platforms which doesn't matter here at all (some 
+    of the code here relies on CMPXCHG8B swaping out both a pointer AND
+    a counter - a trick that won't work on the current 64 bit platforms).
+
+    In any case, if you don't have the platform SDK then just 
+    appropriate casts to make the code compile.  Keep in mind that 
+    casting from 64 bit types to 32 bit types is wrong - where there's 
+    a 64 bit type I meant it to call one of my assembly language
+    routines that uses CMPXCHG8B.
+
+    LONG
+    InterlockedIncrement(
+    LPLONG lpAddend
+    );
+
+    LONG
+    InterlockedDecrement(
+    LPLONG lpAddend
+    );
+
+    LONG
+    InterlockedExchange(
+    LPLONG Target,
+    LONG Value
+    );
+
+    LONG
+    InterlockedExchangeAdd(
+    LPLONG Addend,
+    LONG Value
+    );
+
+    LONG
+    InterlockedCompareExchange (
+    PLONG Destination,
+    LONG ExChange,
+    LONG Comperand
+    );
+
+    PVOID
+    InterlockedExchangePointer (
+    PVOID *Target,
+    PVOID Value
+    );
+
+    PVOID
+    InterlockedCompareExchangePointer (
+    PVOID *Destination,
+    PVOID ExChange,
+    PVOID Comperand
+    );
+*/
+
+//we'll need a special cases for volatile __int64 and 
+//volatile unsigned __int64
+
+template <typename T>
+inline T InterlockedIncrement(volatile T * ptr)
+{
+    return (T)InterlockedIncrement((LPLONG)ptr);
+}
+
+template <typename T>
+inline T InterlockedDecrement(volatile T * ptr)
+{
+    return (T)InterlockedDecrement((LPLONG)ptr);
+}
+
+template <typename T>
+inline T InterlockedExchange(volatile T * target,T value)
+{
+    return (T)InterlockedExchange((LPLONG)target,(LONG)value);
+}
+
+template <typename T>
+inline T InterlockedExchangeAdd(volatile T *addend,T value)
+{
+    return (T)InterlockedExchangeAdd((LPLONG)addend,(LONG)value);
+}
+
+template <typename T>
+T InterlockedCompareExchange (volatile T * dest,T exchange,T comperand)
+{
+    return (T)InterlockedCompareExchange ((LPLONG)dest,
+                                        (LONG)exchange,
+                                        (LONG)comperand);
+}
+//most common use of InterlockedCompareExchange
+template <typename T>
+bool InterlockedSetIfEqual (volatile T * dest,T exchange,T comperand)
+{
+    return comperand==InterlockedCompareExchange(dest,exchange,comperand);
+}
+
+//disable the no return value warning, because the assembly language
+//routines load the appropriate registers directly
+#pragma warning(disable:4035)
+
+inline unsigned __int64 
+InterlockedCompareExchange(volatile unsigned __int64 *dest
+                           ,unsigned __int64 exchange
+                           ,unsigned __int64 comperand) 
+{
+    //value returned in eax::edx
+    __asm {
+        lea esi,comperand;
+        lea edi,exchange;
+        
+        mov eax,[esi];
+        mov edx,4[esi];
+        mov ebx,[edi];
+        mov ecx,4[edi];
+        mov esi,dest;
+        //lock CMPXCHG8B [esi] is equivalent to the following except
+        //that it's atomic:
+        //ZeroFlag = (edx:eax == *esi);
+        //if (ZeroFlag) *esi = ecx:ebx;
+        //else edx:eax = *esi;
+        lock CMPXCHG8B [esi];			
+    }
+}
+
+//most common use of InterlockedCompareExchange
+//It's more efficient to use the z flag than to do another compare
+inline bool 
+InterlockedSetIfEqual(volatile unsigned __int64 *dest
+                      ,unsigned __int64 exchange
+                      ,unsigned __int64 comperand) 
+{
+    //value returned in eax
+    __asm {
+        lea esi,comperand;
+        lea edi,exchange;
+        
+        mov eax,[esi];
+        mov edx,4[esi];
+        mov ebx,[edi];
+        mov ecx,4[edi];
+        mov esi,dest;
+        //lock CMPXCHG8B [esi] is equivalent to the following except
+        //that it's atomic:
+        //ZeroFlag = (edx:eax == *esi);
+        //if (ZeroFlag) *esi = ecx:ebx;
+        //else edx:eax = *esi;
+        lock CMPXCHG8B [esi];			
+        mov eax,0;
+        setz al;
+    }
+}
+#pragma warning(default:4035)
+
+inline unsigned __int64 InterlockedIncrement(volatile unsigned __int64 * ptr)
+{
+    unsigned __int64 comperand;
+    unsigned __int64 exchange;
+    do {
+        comperand = *ptr;
+        exchange = comperand+1;
+    }while(!InterlockedSetIfEqual(ptr,exchange,comperand));
+    return exchange;
+}
+
+inline unsigned __int64 InterlockedDecrement(volatile unsigned __int64 * ptr)
+{
+    unsigned __int64 comperand;
+    unsigned __int64 exchange;
+    do {
+        comperand = *ptr;
+        exchange = comperand-1;
+    }while(!InterlockedSetIfEqual(ptr,exchange,comperand));
+    return exchange;
+}
+
+inline unsigned __int64 InterlockedExchange(volatile unsigned __int64 * target,
+                                            unsigned __int64 value)
+{
+    unsigned __int64 comperand;
+    do {
+        comperand = *target;
+    }while(!InterlockedSetIfEqual(target,value,comperand));
+    return comperand;
+}
+
+inline unsigned __int64 InterlockedExchangeAdd(volatile unsigned __int64 *addend,
+                                               unsigned __int64 value)
+{
+    unsigned __int64 comperand;
+    do {
+        comperand = *addend;
+    }while(!InterlockedSetIfEqual(addend,comperand+value,comperand));
+    return comperand;
+}
+
+#pragma warning(disable:4035)
+inline __int64 
+InterlockedCompareExchange(volatile __int64 *dest
+                           ,__int64 exchange
+                           ,__int64 comperand) 
+{
+    //value returned in eax::edx
+    __asm {
+        lea esi,comperand;
+        lea edi,exchange;
+        
+        mov eax,[esi];
+        mov edx,4[esi];
+        mov ebx,[edi];
+        mov ecx,4[edi];
+        mov esi,dest;
+        //lock CMPXCHG8B [esi] is equivalent to the following except
+        //that it's atomic:
+        //ZeroFlag = (edx:eax == *esi);
+        //if (ZeroFlag) *esi = ecx:ebx;
+        //else edx:eax = *esi;
+        lock CMPXCHG8B [esi];			
+    }
+}
+
+//most common use of InterlockedCompareExchange
+//It's more efficient to use the z flag than to do another compare
+inline bool 
+InterlockedSetIfEqual(volatile __int64 *dest
+                      ,__int64 exchange
+                      ,__int64 comperand) 
+{
+    //value returned in eax
+    __asm {
+        lea esi,comperand;
+        lea edi,exchange;
+        
+        mov eax,[esi];
+        mov edx,4[esi];
+        mov ebx,[edi];
+        mov ecx,4[edi];
+        mov esi,dest;
+        //lock CMPXCHG8B [esi] is equivalent to the following except
+        //that it's atomic:
+        //ZeroFlag = (edx:eax == *esi);
+        //if (ZeroFlag) *esi = ecx:ebx;
+        //else edx:eax = *esi;
+        lock CMPXCHG8B [esi];			
+        mov eax,0;
+        setz al;
+    }
+}
+#pragma warning(default:4035)
+
+inline __int64 InterlockedIncrement(volatile __int64 * dest)
+{
+    __int64 comperand;
+    __int64 exchange;
+    do {
+        comperand = *dest;
+        exchange = comperand+1;
+    }while(!InterlockedSetIfEqual(dest,exchange,comperand));
+    return exchange;
+}
+
+inline __int64 InterlockedDecrement(volatile __int64 * dest)
+{
+    __int64 comperand;
+    __int64 exchange;
+    do {
+        comperand = *dest;
+        exchange = comperand-1;
+    }while(!InterlockedSetIfEqual(dest,exchange,comperand));
+    return exchange;
+}
+
+inline __int64 InterlockedExchange(volatile __int64 * target,__int64 value)
+{
+    __int64 comperand;
+    do {
+        comperand = *target;
+    }while(!InterlockedSetIfEqual(target,value,comperand));
+    return comperand;
+}
+
+inline __int64 InterlockedExchangeAdd(volatile __int64 *addend,
+                                      __int64 value)
+{
+    __int64 comperand;
+    do {
+        comperand = *addend;
+    }while(!InterlockedSetIfEqual(addend,comperand+value,comperand));
+    return comperand;
+}
+
+#pragma warning(disable:4035)
+//I've just thought of some algorithms that use BTS and all so I'm including them
+inline bool InterlockedBTS(volatile int *dest, int bit)
+{
+    //value returned in eax
+    __asm{
+        mov eax,bit;
+        mov ebx,dest;
+        lock bts [ebx],eax;
+        mov eax,0;
+        setc al;
+    }
+}
+inline bool InterlockedBTC(volatile int *dest, int bit)
+{
+    //value returned in eax
+    __asm{
+        mov eax,bit;
+        mov ebx,dest;
+        lock btc [ebx],eax;
+        mov eax,0;
+        setc al;
+    }
+}
+inline bool InterlockedBTR(volatile int *dest, int bit)
+{
+    //value returned in eax
+    __asm{
+        mov eax,bit;
+        mov ebx,dest;
+        lock btr [ebx],eax;
+        mov eax,0;
+        setc al;
+    }
+}
+//you can lock BT but since it doesn't change memory there isn't really any point
+inline bool BT(volatile int *dest, int bit)
+{
+    //value returned in eax
+    __asm{
+        mov eax,bit;
+        mov ebx,dest;
+        bt [ebx],eax;
+        mov eax,0;
+        setc al;
+    }
+}
+#pragma warning(default:4035)
+
+inline bool InterlockedBTS(volatile unsigned *dest, int bit)
+{
+    return InterlockedBTS((volatile int *)dest,bit);
+}
+inline bool InterlockedBTC(volatile unsigned *dest, int bit)
+{
+    return InterlockedBTC((volatile int *)dest,bit);
+}
+inline bool InterlockedBTR(volatile unsigned *dest, int bit)
+{
+    return InterlockedBTR((volatile int *)dest,bit);
+}
+inline bool BT(volatile unsigned *dest, int bit)
+{
+    return BT((volatile int *)dest,bit);
+}
+
+inline bool InterlockedBTS(volatile unsigned long *dest, int bit)
+{
+    return InterlockedBTS((volatile int *)dest,bit);
+}
+inline bool InterlockedBTC(volatile unsigned long *dest, int bit)
+{
+    return InterlockedBTC((volatile int *)dest,bit);
+}
+inline bool InterlockedBTR(volatile unsigned long *dest, int bit)
+{
+    return InterlockedBTR((volatile int *)dest,bit);
+}
+inline bool BT(volatile unsigned long *dest, int bit)
+{
+    return BT((volatile int *)dest,bit);
+}
+
+inline bool InterlockedBTS(volatile long *dest, int bit)
+{
+    return InterlockedBTS((volatile int *)dest,bit);
+}
+inline bool InterlockedBTC(volatile long *dest, int bit)
+{
+    return InterlockedBTC((volatile int *)dest,bit);
+}
+inline bool InterlockedBTR(volatile long *dest, int bit)
+{
+    return InterlockedBTR((volatile int *)dest,bit);
+}
+inline bool BT(volatile long *dest, int bit)
+{
+    return BT((volatile int *)dest,bit);
+}
+
+inline bool InterlockedBTS(volatile __int64 *dest, int bit)
+{
+    if (bit<32) return InterlockedBTS((volatile int *)&dest,bit);
+    return InterlockedBTS(1+(volatile int *)&dest,bit-32);
+}
+inline bool InterlockedBTC(volatile __int64 *dest, int bit)
+{
+    if (bit<32) return InterlockedBTC((volatile int *)&dest,bit);
+    return InterlockedBTC(1+(volatile int *)&dest,bit-32);
+}
+inline bool InterlockedBTR(volatile __int64 *dest, int bit)
+{
+    if (bit<32) return InterlockedBTR((volatile int *)&dest,bit);
+    return InterlockedBTR(1+(volatile int *)&dest,bit-32);
+}
+inline bool BT(volatile __int64 *dest, int bit)
+{
+    if (bit<32) return BT((volatile int *)&dest,bit);
+    return BT(1+(volatile int *)&dest,bit-32);
+}
+inline bool InterlockedBTS(volatile unsigned __int64 *dest, int bit)
+{
+    if (bit<32) return InterlockedBTS((volatile int *)&dest,bit);
+    return InterlockedBTS(1+(volatile int *)&dest,bit-32);
+}
+inline bool InterlockedBTC(volatile unsigned __int64 *dest, int bit)
+{
+    if (bit<32) return InterlockedBTC((volatile int *)&dest,bit);
+    return InterlockedBTC(1+(volatile int *)&dest,bit-32);
+}
+inline bool InterlockedBTR(volatile unsigned __int64 *dest, int bit)
+{
+    if (bit<32) return InterlockedBTR((volatile int *)&dest,bit);
+    return InterlockedBTR(1+(volatile int *)&dest,bit-32);
+}
+inline bool BT(volatile unsigned __int64 *dest, int bit)
+{
+    if (bit<32) return BT((volatile int *)&dest,bit);
+    return BT(1+(volatile int *)&dest,bit-32);
+}
+
+//T can be int, unsigned int, long, unsigned long
+//__int64 or unsigned __int64
+template <typename T>
+struct Atomic 
+{
+    volatile T value;
+    
+    inline Atomic(){}
+    
+    explicit inline Atomic(T n)
+    { //so that it's globally visible (to use intel's terminology)
+        InterlockedExchange(&value,n);
+    }
+    
+    //if you need an atomic load then use (*this += 0)
+    //but I haven't found any algorithms that
+    //require an atomic load
+    inline operator T() const
+    {
+        return value;
+    }
+    
+    inline T CompareExchange(T exchange, T comperand)
+    {
+        return InterlockedCompareExchange(&value,exchange,comperand);
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare
+    inline T CompareSomeBitsAndExchange(T exchange, T comperand, T bitsToCompare)
+    {
+        T returned;
+        T bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return returned;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (comparand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return returned;
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and in the set
+    inline T CompareSomeBitsAndExchangeThoseBits(T exchange, T comperand, T bitsToCompare)
+    {
+        T returned = comperand;
+        T bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            exchange = (exchange&bitsToCompare) | (bitsToIgnore&returned);
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return returned;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (comparand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return returned;
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and others in the set
+    inline T CompareSomeBitsAndExchangeSomeOtherBits(T exchange, 
+        T bitsToExchange,
+        T comperand, 
+        T bitsToCompare
+        )
+    {
+        T returned = value;
+        T bitsToIgnore = ~bitsToCompare;
+        T bitsToLeave = ~bitsToExchange;
+        for (;;){
+            exchange = (exchange&bitsToExchange) | (bitsToLeave&returned);
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return returned;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (comparand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return returned;
+    }
+    
+    inline T Exchange(T exchange) 
+    {
+        return InterlockedExchange(&value,exchange);
+    }
+    
+    inline bool SetIfEqual(T exchange, T comperand)
+    {
+        return InterlockedSetIfEqual(&value,exchange,comperand);
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare
+    inline bool SetIfSomeBitsAreEqual(T exchange, T comperand, T bitsToCompare)
+    {
+        T returned;
+        T bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return true;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (comparand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return false;
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and in the set
+    inline bool SetSomeBitsIfThoseBitsAreEqual(T exchange, T comperand, T bitsToCompare)
+    {
+        T returned = comperand;
+        T bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            exchange = (exchange&bitsToCompare) | (bitsToIgnore&returned);
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return true;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comperand = (comperand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return false;
+    }
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and others in the set
+    inline bool SetSomeBitsIfSomeOtherBitsAreEqual(T exchange, 
+        T bitsToExchange,
+        T comperand, 
+        T bitsToCompare
+        )
+    {
+        T returned = value;
+        T bitsToIgnore = ~bitsToCompare;
+        T bitsToLeave = ~bitsToExchange;
+        for (;;){
+            exchange = (exchange&bitsToExchange) | (bitsToLeave&returned);
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return true;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comperand = (comperand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return false;
+    }
+    
+    inline T operator =(T exchange)
+    {
+        Exchange(exchange);
+        return exchange;
+    }
+    
+    inline T operator +=(T n)
+    {
+        return n + InterlockedExchangeAdd(&value, n);
+    }
+    
+    inline T operator -=(T n)
+    {
+        return InterlockedExchangeAdd(&value, -n) - n;
+    }
+    inline T operator ++()
+    {
+        return (*this += 1);
+    }
+    
+    inline T operator --() 
+    {
+        return (*this -= 1);
+    }
+    
+    inline T operator ++(int)
+    {
+        return InterlockedExchangeAdd(&value, (T)1);
+    }
+    
+    inline T operator --(int)
+    {
+        return InterlockedExchangeAdd(&value, (T)-1);
+    }
+    
+    inline T operator &=(T n)
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value; 
+            exchange = comperand & n;
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    inline T operator |= (T n) 
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value; 
+            exchange = comperand | n;
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    //yes this isn't standard, but it's useful
+    inline T Mask(T bitsToKeep, T bitsToSet) 
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value ; 
+            exchange = ((comperand & bitsToKeep) | bitsToSet);
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    inline T operator ^= (T n)
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value;  
+            exchange = comperand ^ n;
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    inline T operator *= (T n) 
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value;  
+            exchange = comperand * n;
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    inline T operator /= (T n) 
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value;
+            exchange = comperand / n;
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    inline T operator >>= (unsigned n) 
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value;
+            exchange = comperand >> n;
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    inline T operator <<= (unsigned n)
+    {
+        T comperand;
+        T exchange;
+        do {
+            comperand = value;
+            exchange = comperand << n;
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    inline bool BTS(int bit)
+    {
+        return InterlockedBTS(&value,bit);
+    }
+    inline bool BTC(int bit)
+    {
+        return InterlockedBTC(&value,bit);
+    }
+    inline bool BTR(int bit)
+    {
+        return InterlockedBTR(&value,bit);
+    }
+    inline bool BT(int bit)
+    {
+        return BT(&value,bit);
+    }
+}; 
+
+template <typename T>
+T * InterlockedCompareExchangePointer (
+                                       T *volatile *dest,
+                                       T * exchange,
+                                       T * comperand
+                                       )
+{
+    return (T*)InterlockedCompareExchangePointer((PVOID *)dest,(PVOID)exchange,(PVOID)comperand);
+}
+
+template <typename T>
+struct AtomicPtr 
+{
+    T * volatile value;
+    
+    inline AtomicPtr(){}
+    
+    explicit inline AtomicPtr(T *n)
+    { //so that it's globally visible (to use intel's terminology)
+        InterlockedExchange(&value,n);
+    }
+    explicit inline AtomicPtr(const T *n)
+    { //so that it's globally visible (to use intel's terminology)
+        InterlockedExchange(&value,(T *)n);
+    }
+    
+    inline operator T *() const
+    {
+        return value;
+    }
+    
+    inline operator const T *() const
+    {
+        return value;
+    }
+    
+    inline T *CompareExchange(T *exchange, T *comperand)
+    {
+        return InterlockedCompareExchangePointer(&value,exchange,comperand);
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare
+    inline T * CompareSomeBitsAndExchange(T * exchange, T * comperand, int bitsToCompare)
+    {
+        T  *returned;
+        int bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            returned = InterlockedCompareExchangePointer(&value,exchange,comperand);
+            if (returned == comperand) return returned;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (comparand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return returned;
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and in the set
+    inline T * CompareSomeBitsAndExchangeThoseBits(T * exchange, T * comperand, int bitsToCompare)
+    {
+        T *returned = comperand;
+        int bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            exchange = (T*)((int)exchange&bitsToCompare) | (bitsToIgnore&(int)returned);
+            returned = InterlockedCompareExchangePointer(&value,exchange,comperand);
+            if (returned == comperand) return returned;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (T*)((int)comparand&bitsToCompare) | (bitsToIgnore&(int)returned);
+        }
+        return returned;
+    }
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and others in the set
+    inline T* CompareSomeBitsAndExchangeSomeOtherBits(T *exchange, 
+        int bitsToExchange,
+        T * comperand, 
+        int bitsToCompare
+        )
+    {
+        T * returned = value;
+        int bitsToIgnore = ~bitsToCompare;
+        int bitsToLeave = ~bitsToExchange;
+        for (;;){
+            exchange = (T*)((int)exchange&bitsToExchange) | (bitsToLeave&(int)returned);
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return returned;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (T*)((int)comparand&bitsToCompare) | (bitsToIgnore&(int)returned);
+        }
+        return returned;
+    }
+    
+    inline T * operator ->() const
+    {
+        assert (value != NULL);
+        return value;
+    }
+    
+    inline T *Exchange(T *exchange) 
+    {
+        return InterlockedExchange(&value,exchange);
+    }
+    
+    inline bool SetIfEqual(T *exchange, T *comperand)
+    {
+        return comperand==InterlockedCompareExchangePointer(&value,exchange,comperand);
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare
+    inline bool SetIfSomeBitsAreEqual(T * exchange, T * comperand, int bitsToCompare)
+    {
+        T  *returned;
+        int bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            returned = InterlockedCompareExchangePointer(&value,exchange,comperand);
+            if (returned == comperand) return true;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (comparand&bitsToCompare) | (bitsToIgnore&returned);
+        }
+        return false;
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and in the set
+    inline bool SetSomeBitsIfThoseBitsAreEqual(T * exchange, T * comperand, int bitsToCompare)
+    {
+        T *returned = comperand;
+        int bitsToIgnore = ~bitsToCompare;
+        for (;;){
+            exchange = (T*)((int)exchange&bitsToCompare) | (bitsToIgnore&(int)returned);
+            returned = InterlockedCompareExchangePointer(&value,exchange,comperand);
+            if (returned == comperand) return true;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (T*)((int)comparand&bitsToCompare) | (bitsToIgnore&(int)returned);
+        }
+        return false;
+    }
+    
+    //useful - simulates having a compareexchange that ignores some bits in
+    //the compare and others in the set
+    inline bool SetSomeBitsIfSomeOtherBitsAreEqual(T *exchange, 
+        int bitsToExchange,
+        T * comperand, 
+        int bitsToCompare
+        )
+    {
+        T * returned = value;
+        int bitsToIgnore = ~bitsToCompare;
+        int bitsToLeave = ~bitsToExchange;
+        for (;;){
+            exchange = (T*)((int)exchange&bitsToExchange) | (bitsToLeave&(int)returned);
+            returned = InterlockedCompareExchange(&value,exchange,comperand);
+            if (returned == comperand) return true;
+            if (0 != ((returned ^ comperand) & bitsToCompare)) break;
+            comparand = (T*)((int)comparand&bitsToCompare) | (bitsToIgnore&(int)returned);
+        }
+        return false;
+    }
+    
+    inline T *operator =(T *exchange)
+    {
+        Exchange(exchange);
+        return exchange;
+    }
+    
+    template <typename INT_TYPE>
+        inline T *operator +=(INT_TYPE n)
+    {
+        return n + (T *)InterlockedExchangeAdd((LPLONG)&value, (LONG)(n * (LONG)sizeof(T)));
+    }
+    
+    template <typename INT_TYPE>
+        inline T *operator -=(INT_TYPE n)
+    {
+        return (T *)InterlockedExchangeAdd((LPLONG)&value, (LONG)(-n * (LONG)sizeof(T))) - n;
+    }
+    inline T *operator ++()
+    {
+        return (T *)InterlockedExchangeAdd((LPLONG)&value, (LONG)(sizeof(T))) + 1;
+    }
+    
+    inline T *operator --() 
+    {
+        return (T *)InterlockedExchangeAdd((LPLONG)&value, -(LONG)(sizeof(T))) - 1;
+    }
+    
+    inline T *operator ++(int)
+    {
+        return (T*)InterlockedExchangeAdd((LPLONG)&value, (LONG)sizeof(T));
+    }
+    
+    inline T *operator --(int)
+    {
+        return (T*)InterlockedExchangeAdd((LPLONG)&value, -(LONG)sizeof(T));
+    }
+    
+    //yes this isn't standard, but it's useful
+    template <typename INT_TYPE>
+        inline T *operator &=(INT_TYPE n)
+    {
+        T * comperand;
+        T * exchange;
+        do {
+            comperand = value; 
+            exchange = (T *)((LONG)comperand & n);
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    //yes this isn't standard, but it's useful
+    template <typename INT_TYPE>
+        inline T *operator |= (INT_TYPE n) 
+    {
+        T * comperand;
+        T * exchange;
+        do {
+            comperand = value; 
+            exchange = (T *)((LONG)comperand | n);
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    //yes this isn't standard, but it's useful
+    template <typename INT_TYPE>
+        inline T operator ^= (INT_TYPE n)
+    {
+        T * comperand;
+        T * exchange;
+        do {
+            comperand = value;  
+            exchange = (T *)((LONG)comperand ^ n);
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    //yes this isn't standard, but it's useful
+    template <typename INT_TYPE_A, typename INT_TYPE_B>
+        inline T * Mask(INT_TYPE_A bitsToKeep, INT_TYPE_B bitsToSet) 
+    {
+        T * comperand;
+        T * exchange;
+        do {
+            comperand = value ; 
+            exchange = (T *)(((LONG)comperand & bitsToKeep) | bitsToSet);
+        }while(!SetIfEqual(exchange, comperand));
+        return exchange;
+    }
+    
+    
+    inline bool BTS(int bit)
+    {
+        return InterlockedBTS((volatile int *)&value,bit);
+    }
+    inline bool BTC(int bit)
+    {
+        return InterlockedBTC((volatile int *)&value,bit);
+    }
+    inline bool BTR(int bit)
+    {
+        return InterlockedBTR((volatile int *)&value,bit);
+    }
+    inline bool BT(int bit)
+    {
+        return BT((volatile int *)&value,bit);
+    }
+};
+
+typedef Atomic<int> AtomicInt;
+typedef Atomic<unsigned int> AtomicUInt;
+typedef Atomic<__int64> AtomicInt64;
+typedef Atomic<unsigned __int64> AtomicUInt64;
+
+template
+<typename T, typename A = AtomicUInt64>
+struct AtomicUnion
+{
+    A whole;
+    
+    AtomicUnion()
+    {
+        assert(sizeof(T)<=sizeof(A));
+        new((void *)&whole) T();//in place new
+        
+    }
+    
+    ~AtomicUnion()
+    {
+        ((T *)&whole)->~T();
+    }
+    
+    T &Value() 
+    {
+        return *(T *)&whole;
+    }
+    const T &Value() const
+    {
+        return *(const T *)&whole;
+    }
+};
+
+
+template <typename T>
+struct AtomicPtrWithCountStruct
+{
+    AtomicPtr<T> ptr;
+    AtomicInt count;
+};
+
+
+template <typename T>
+struct PtrWithCountStruct
+{
+    T* ptr;
+    int count;
+};
+
+template <typename T, int MARKBITS=3>
+struct BitMarkedAtomicPtr : public AtomicPtr<T>
+{
+    const int MarkMask() const
+    {
+        return (1<<MARKBITS)-1;
+    }
+    
+    const int DataMask() const
+    {
+        return ~ThreadBitMask();
+    }
+    
+    static T * MaskPtr(T *data)
+    {
+        return (T*)((int)data & DataMask());
+    }
+    
+    static int MaskMark(T * data)
+    {
+        return ((int)data & MarkMask());
+    }
+    
+    static T* Mark(T * data, int bit)
+    {
+        return (T*)((int)data | 1<<bit);
+    }
+    
+    static T* Unmark(T * data, int bit)
+    {
+        return (T*)((int)data & ~(1<<bit));
+    }
+    
+    T * MaskPtr()
+    {
+        return MaskPtr(value);
+    }
+    
+    int MaskMark()
+    {
+        return MaskMark(value);
+    }
+    
+    //note new value has the marks in exchange not the original marks
+    inline bool SetIfMarked(T *exchange, int bit)
+    {
+        assert(bit<MARKBITS);
+        return SetIfSomeBitsAreEqual(exchange,DataMask(),(T*)(1<<bit),1<<bit);
+    }
+    
+    //note new value has the marks in exchange not the original marks
+    T * ExchangeIfMarked(T *exchange, int bit)
+    {
+        assert(bit<MARKBITS);
+        return CompareSomeBitsAndExchange(exchange,DataMask(),(T*)(1<<bit),1<<bit);
+    }
+    
+    inline bool SetAndClearMarksIfMarked(T *exchange, int bit)
+    {
+        return SetIfMarked(MaskPtr(exchange),bit);
+    }
+    
+    inline T * ExchangeAndClearMarksIfMarked(T *exchange, int bit)
+    {
+        return ExchangeIfMarked(MaskPtr(exchange),bit);
+    }
+    
+    inline bool SetAndClearOtherMarksIfMarked(T *exchange, int bit)
+    {
+        return SetIfMarked(Mark(MaskPtr(exchange),bit),
+            bit);
+    }
+    
+    //note new value has the marks in exchange not the original marks
+    T * ExchangeAndClearOtherMarksIfMarked(T *exchange, int bit)
+    {
+        return ExchangeIfMarked(Mark(MaskPtr(exchange),bit),
+            bit);
+    }
+    
+    inline bool SetAndMarkIfMarked(T *exchange, int bit)
+    {
+        return SetSomeBitsIfSomeOtherBitsAreEqual(
+            Mark(MaskPtr(exchange),bit),
+            DataMask() | 1<<bit,
+            (T*)(1<<bit),
+            1<<bit);
+    }
+    
+    inline T * ExchangeAndMarkIfMarked(T *exchange, int bit)
+    {
+        return CompareSomeBitsAndExchangeSomeOtherBits(
+            Mark(MaskPtr(exchange),bit),
+            DataMask() | 1<<bit,
+            (T*)(1<<bit),
+            1<<bit);
+    }	 
+    
+    bool Mark(int bit)
+    {
+        assert(bit<MARKBITS);
+        return BTS(bit);
+    }
+    
+    bool Unmark(int bit)
+    {
+        assert(bit<MARKBITS);
+        return BTC(bit);
+    }
+    
+    bool InvertMark(int bit)
+    {
+        assert(bit<MARKBITS);
+        return BTR(bit);
+    }
+    
+    bool IsMarked(int bit)
+    {
+        assert(bit<MARKBITS);
+        return BT(bit);
+    }
+};
+
+template <typename T>
+struct AtomicPtrWithCount : public AtomicUnion< AtomicPtrWithCountStruct<T> >
+{
+    typedef AtomicUnion< PtrWithCountStruct<T>, unsigned __int64 > SimpleUnionType;
+    AtomicPtrWithCount()
+    {
+        whole = 0;
+    }
+    AtomicPtrWithCount(unsigned __int64 o)
+    {
+        whole = o.whole;
+    }
+    AtomicPtrWithCount(T *ptr)
+    {
+        SimpleUnionType o;
+        o.Value().ptr = ptr;
+        o.Value().count = 0;
+        whole = o.whole;
+    }
+    
+    operator unsigned __int64() const
+    {
+        return whole;
+    }
+    T *Ptr() const
+    {
+        return Value().ptr;
+    }
+    int Count() const
+    {
+        return Value().count;
+    }
+    unsigned __int64 Whole() const
+    {
+        return whole;
+    }
+    
+    AtomicPtr<T> & Ptr()
+    {
+        return Value().ptr;
+    }
+    AtomicInt & Count()
+    {
+        return Value().count;
+    }
+    AtomicUInt64 & Whole()
+    {
+        return whole;
+    }
+    unsigned __int64 SetPtrAndIncCount(T *ptr)
+    {
+        SimpleUnionType was;
+        SimpleUnionType to;
+        to.Value().ptr = ptr;
+        do {
+            was.whole = whole;
+            to.Value().count = was.Value().count+1;
+        }while(!whole.SetIfEqual(to.whole,was.whole));
+        return to.whole;
+    }
+    bool SetIfPtrEqualAndIncCount(T *exchange, T *comperand)
+    {
+        SimpleUnionType was;
+        SimpleUnionType to;
+        to.ptr = exchange;
+        do {
+            was.whole = whole;
+            if (was.Value().ptr != comperand) return false;
+            to.Value().count = was.Value().count+1;
+        }while(!whole.SetIfEqual(to.whole,was.whole));
+        return true;
+    }
+    unsigned __int64 ExchangeIfPtrEqualAndIncCount(T *exchange, T *comperand)
+    {
+        SimpleUnionType was;
+        SimpleUnionType to;
+        to.ptr = exchange;
+        do {
+            was.whole = whole;
+            if (was.Value().ptr != comperand) return was.whole;
+            to.Value().count = was.Value().count+1;
+        }while(!whole.SetIfEqual(to.whole,was.whole));
+        return was.whole;
+    }
+    inline T *operator =(T *exchange)
+    {
+        SetPtrAndIncCount(exchange);
+        return exchange;
+    }
+    
+    template <typename INT_TYPE>
+        inline T *operator +=(INT_TYPE n)
+    {
+        T *was;
+        T *to;
+        do {
+            was = Ptr();
+            to = was + n;
+        }while (!SetIfPtrEqualAndIncCount(to,was));
+        return to;
+    }
+    
+    template <typename INT_TYPE>
+        inline T *operator -=(INT_TYPE n)
+    {
+        T *was;
+        T *to;
+        do {
+            was = Ptr();
+            to = was - n;
+        }while (!SetIfPtrEqualAndIncCount(to,was));
+        return to;
+    }
+    inline T *operator ++()
+    {
+        return (*this += 1);
+    }
+    
+    inline T *operator --() 
+    {
+        return (*this -= 1);
+    }
+    
+    inline T *operator ++(int)
+    {
+        return (++ *this) - 1;
+    }
+    
+    inline T *operator --(int)
+    {
+        return (-- *this) + 1;
+    }
+};
+
+template <typename T>
+struct AtomicPtrWithMarkStruct
+{
+    AtomicPtr<T> ptr;
+    AtomicUInt mark;
+};
+
+
+template <typename T>
+struct PtrWithMarkStruct
+{
+    T* ptr;
+    unsigned int mark;
+};
+
+template <typename T>
+struct AtomicPtrWithMark : public AtomicUnion< AtomicPtrWithMarkStruct<T> >
+{
+    typedef AtomicUnion< PtrWithMarkStruct<T>, unsigned __int64 > SimpleUnionType;
+    AtomicPtrWithMark()
+    {
+        whole = 0;
+    }
+    AtomicPtrWithMark(unsigned __int64 o)
+    {
+        whole = o.whole;
+    }
+    AtomicPtrWithMark(T *ptr)
+    {
+        SimpleUnionType o;
+        o.Value().ptr = ptr;
+        o.Value().mark = 0;
+        whole = o.whole;
+    }
+    
+    operator unsigned __int64() const
+    {
+        return whole;
+    }
+    T *Ptr() const
+    {
+        return Value().ptr;
+    }
+    int Mark() const
+    {
+        return Value().mark;
+    }
+    unsigned __int64 Whole() const
+    {
+        return whole;
+    }
+    
+    AtomicPtr<T> & Ptr()
+    {
+        return Value().ptr;
+    }
+    AtomicInt & Mark()
+    {
+        return Value().mark;
+    }
+    AtomicUInt64 & Whole()
+    {
+        return whole;
+    }
+    
+    inline bool SetAndClearOtherMarksIfMarked(T *exchange, int bit)
+    {
+        assert(bit<32);
+        SimpleUnionType compareMask;
+        compareMask.Value().ptr = NULL;
+        compareMask.Value().mark = 1u<<bit;
+        
+        SimpleUnionType exchangeValue;
+        exchangeValue.Value().ptr = exchange;
+        exchangeValue.Value().mark = compareMask.Value().mark;
+        
+        return whole.SetSomeBitsIfSomeOtherBitsAreEqual(exchangeValue.whole,
+            (unsigned __int64)-1i64,
+            compareMask.whole,
+            compareMask.whole);
+    }
+    inline bool SetAndClearMarksIfPtrEqual(T *exchange, T*comparend)
+    {
+        SimpleUnionType compareMask;
+        compareMask.Value().ptr = (T*)-1;
+        compareMask.Value().mark = 0;
+        
+        SimpleUnionType compareValue;
+        compareValue.Value().ptr = comparend;
+        compareValue.Value().mark = 0;
+        
+        SimpleUnionType exchangeValue;
+        exchangeValue.Value().ptr = exchange;
+        exchangeValue.Value().mark = 0;
+        
+        return whole.SetSomeBitsIfThoseBitsAreEqual(exchangeValue.whole,
+            compareValue.whole,
+            compareMask.whole);
+    }
+    
+    inline bool SetAndClearMarksIfMarked(T *exchange, int bit)
+    {
+        assert(bit<32);
+        SimpleUnionType compareMask;
+        compareMask.Value().ptr = NULL;
+        compareMask.Value().mark = 1u<<bit;
+        
+        SimpleUnionType exchangeValue;
+        exchangeValue.Value().ptr = exchange;
+        exchangeValue.Value().mark = 0;
+        
+        return whole.SetSomeBitsIfSomeOtherBitsAreEqual(exchangeValue.whole,
+            (unsigned __int64)-1i64,
+            compareMask.whole,
+            compareMask.whole);
+    }
+    //note new value has the marks in exchange not the original marks
+    unsigned __int64 ExchangeAndClearOtherMarksIfMarked(T *exchange, int bit)
+    {
+        assert(bit<32);
+        SimpleUnionType compareMask;
+        compareMask.Value().ptr = NULL;
+        compareMask.Value().mark = 1u<<bit;
+        
+        SimpleUnionType exchangeValue;
+        exchangeValue.Value().ptr = exchange;
+        exchangeValue.Value().mark = compareMask.Value().mark;
+        
+        return whole.CompareSomeBitsAndExchangeSomeOtherBits(exchangeValue.whole,
+            exchangeMask.whole,
+            compareMask.whole,
+            compareMask.whole);
+    }
+    
+    unsigned __int64 ExchangeAndClearMarksIfMarked(T *exchange, int bit)
+    {
+        assert(bit<32);
+        SimpleUnionType compareMask;
+        compareMask.Value().ptr = NULL;
+        compareMask.Value().mark = 1u<<bit;
+        
+        SimpleUnionType exchangeValue;
+        exchangeValue.Value().ptr = exchange;
+        exchangeValue.Value().mark = 0;
+        
+        return whole.CompareSomeBitsAndExchangeSomeOtherBits(exchangeValue.whole,
+            exchangeMask.whole,
+            compareMask.whole,
+            compareMask.whole);
+    }
+    
+    
+    bool Mark(int bit)
+    {
+        assert(bit<32);
+        return whole.BTS(bit);
+    }
+    
+    bool Unmark(int bit)
+    {
+        assert(bit<32);
+        return whole.BTC(bit);
+    }
+    
+    bool InvertMark(int bit)
+    {
+        assert(bit<32);
+        return whole.BTR(bit);
+    }
+    
+    bool IsMarked(int bit)
+    {
+        assert(bit<32);
+        return whole.BT(bit);
+    }
+};
+
+struct MPStackElement
+{
+    MPStackElement * next;
+};
+
+class MPMarkStack
+{
+protected:
+    AtomicPtrWithMark<MPStackElement> tos;
+    AtomicUInt availableThreadBits;
+    AtomicInt lastReserved;
+    
+public:
+    
+    MPStackElement* ExchangeStack(MPStackElement *newStack = NULL)
+    {
+        AtomicPtrWithMark<MPStackElement>::SimpleUnionType newWhole, ret;
+        newWhole.Value().mark = 0;
+        newWhole.Value().ptr = newStack;
+        ret.whole = tos.whole.Exchange(newWhole.whole);
+        return ret.Value().ptr;
+    }
+    
+    MPMarkStack()
+    {
+        lastReserved = 0;
+        tos.whole = 0;
+        availableThreadBits = (unsigned)-1;
+    }
+    
+    //expensive - call once per thread to reserve a bit
+    int ReserveThreadBit()
+    {
+        int offset = lastReserved;
+        //since locked operations are slow we only do
+        //bit tests on bits that look acceptable because
+        //of a nonlocked read.
+        unsigned readOnce = availableThreadBits;
+        for (int i=0; i< 32;++i){
+            const int bit = (31&(i+offset));
+            const unsigned mask = 1u<<bit;
+            if ((readOnce & mask)!=0){
+                if (availableThreadBits.BTR(bit)) {
+                    lastReserved = ((bit+1)&31);//try the next one
+                    return bit+1;
+                }
+                readOnce = availableThreadBits;
+            }
+        }
+        return 0;
+    }
+    int BlockingReserveThreadBit()
+    {
+        int i;
+        while (0==(i=ReserveThreadBit()));
+        return i;
+    }
+    int ReturnThreadBit(int i)
+    {
+        availableThreadBits.BTS(i-1);
+    }
+    
+    void PushElement(MPStackElement *element)
+    {
+        MPStackElement * next;
+        do {
+            next = tos.Value().ptr;
+            element->next = tos.Value().ptr;
+        }while(!tos.SetAndClearMarksIfPtrEqual(element,next));
+    }
+    
+    void PushList(MPStackElement *top, MPStackElement *bottom)
+    {
+        MPStackElement * next;
+        do {
+            next = tos.Value().ptr;
+            bottom->next = tos.Value().ptr;
+        }while(!tos.SetAndClearMarksIfPtrEqual(top,next));
+    }
+    
+    MPStackElement* PopElement(int i)
+    {
+        for(;;) {
+            tos.Value().mark.BTS(i-1);
+            MPStackElement * was = tos.Value().ptr;
+            if (was == NULL) return NULL;
+            if (tos.SetAndClearMarksIfMarked(was->next,i-1)) return was;
+        }
+    }
+    MPStackElement* PopElement()
+    {
+        int id = BlockingReserveThreadBit();
+        MPStackElement * ret = PopElement(id);
+        ReturnThreadBit(id);
+        return ret;
+    }
+};
+class MPCountStack
+{
+protected:
+    AtomicPtrWithCount<MPStackElement> tos;
+    
+public:
+    
+    MPStackElement* ExchangeStack(MPStackElement *newStack = NULL)
+    {
+        return tos.Value().ptr.Exchange(newStack);
+    }
+    
+    MPCountStack()
+    {
+        tos = 0;
+    }
+    
+    
+    void PushElement(MPStackElement *element)
+    {
+        MPStackElement * next;
+        do {
+            next = tos.Value().ptr;
+            element->next = tos.Value().ptr;
+        }while(!tos.Value().ptr.SetIfEqual(element,next));
+    }
+    
+    void PushList(MPStackElement *top, MPStackElement *bottom)
+    {
+        MPStackElement * next;
+        do {
+            next = tos.Value().ptr;
+            bottom->next = tos.Value().ptr;
+        }while(!tos.Value().ptr.SetIfEqual(top,next));
+    }
+    
+    MPStackElement* PopElement()
+    {
+        AtomicPtrWithCount<MPStackElement>::SimpleUnionType was;
+        AtomicPtrWithCount<MPStackElement>::SimpleUnionType to;
+        do {
+            was.whole = tos.whole;
+            if (was.Value().ptr == NULL) return NULL;
+            to.Value().count = was.Value().count + 1;
+            to.Value().ptr = was.Value().ptr->next;
+        }while(!tos.whole.SetIfEqual(was.whole,to.whole));
+        return was.Value().ptr;
+    }
+};
+
+class SimpleStack
+{
+protected:
+    MPStackElement * tos;
+    
+public:
+    
+    MPStackElement* ExchangeStack(MPStackElement *newStack = NULL)
+    {
+        MPStackElement* was = tos;
+        tos = newStack;
+        return was;
+    }
+    
+    SimpleStack()
+    {
+        tos = 0;
+    }
+    
+    
+    void PushElement(MPStackElement *element)
+    {
+        element->next = tos;
+        tos = element;
+    }
+    
+    void PushList(MPStackElement *top, MPStackElement *bottom)
+    {
+        bottom->next = tos;
+        tos = top;
+    }
+    
+    MPStackElement* PopElement()
+    {
+        MPStackElement* was = tos;
+        if (was) tos = was->next;
+        return was;
+    }
+};
+
+template <typename T>
+struct MPMemBlock : public MPStackElement
+{
+    T data;
+    char bottomOfBlock;
+};
+
+#define OFFSET_OF_MEMBER(type,member) ((int)&(((type *)0)->member))
+
+//put one of these in each thread object 
+//and you have thread local allocation
+template <typename T, typename BLOCK_ALLOCATOR>
+struct SimpleBlockAllocator
+{
+    SimpleStack data;
+    
+    typedef SimpleBlockAllocator<T,BLOCK_ALLOCATOR> AllocatorType;
+    
+    BLOCK_ALLOCATOR & MyAllocator;
+    
+    SimpleBlockAllocator(BLOCK_ALLOCATOR &source)
+        :MyAllocator(source)
+    {}
+    
+    int ReserveThreadBit(){return 1;}
+    int BlockingReserveThreadBit(){return 1;}
+    void ReturnThreadBit(int){  }
+    
+    int Size() const { return sizeof(T); }
+    void AddBlock()
+    {
+        int blockSize = MyAllocator.Size();
+        int elementSize = sizeof(MPMemBlock<T>);
+        assert( MyAllocator.Size() >= elementSize);
+        MPMemBlock<T> *bottom = (MPMemBlock<T> *)MyAllocator.Allocate();
+        void *prev = 0;
+        MPMemBlock<T> *top = bottom;  
+        do {
+            top->bottomOfBlock = 0;
+            top->next = prev;
+            prev = (void *)top;
+            ++top;
+        }while( (blockSize-=elementSize) >= elementSize);
+        bottom->bottomOfBlock = 1;
+        data.PushList(top-1,bottom);
+    }
+    void *Allocate(int) 
+    {	
+        return Allocate();
+    }
+    void *Allocate() 
+    {	
+        MPMemBlock<T> *ret;
+        for(;;) {
+            ret = (MPMemBlock<T> *)data.PopElement();
+            if (ret!=NULL) return &ret->data;
+            AddBlock();
+        }
+    }
+    void Deallocate(void *ob) 
+    {
+        if (!ob) return;
+        MPMemBlock<T> *ret = (MPMemBlock<T> *)
+            ((char *)data - OFFSET_OF_MEMBER(MPMemBlock<T>,ob));
+        data.PushElement(ret);
+    }
+    //all memory allocated must be Deallocated before Clear() or the destructor is called
+    void Clear() 
+    {
+        MPMemBlock<T> *stackWas = (MPMemBlock<T> *)data.ExchangeStack(NULL);
+        MPMemBlock<T> *blocks = NULL;
+        while(stackWas){
+            if (stackWas->bottomOfBlock){
+                if (blocks) {
+                    blocks->next = stackWas;
+                }
+                blocks = stackWas;
+            }
+            stackWas = (MPMemBlock<T> *)(stackWas->next);
+        }
+        while(blocks)
+        {
+            stackWas = (MPMemBlock<T> *)blocks->next;
+            MyAllocator.Deallocate(blocks);
+            blocks = stackWas;
+        }
+    }
+    //all memory allocated must be Deallocated before Clear() or the destructor is called
+    ~SimpleBlockAllocator()
+    {
+        Clear();
+    }
+};
+
+template<class T>
+struct SimpleAllocator
+{
+    void *Allocate() { return (void *)new char[sizeof(T)]; }
+    void *Allocate(int) { return Allocate(); }
+    
+    int ReserveThreadBit(){return 1;}
+    int BlockingReserveThreadBit(){return 1;}
+    void ReturnThreadBit(int){}
+    
+    void Deallocate(void *data) { delete[] (char[])data; }
+    int Size() const { return sizeof(T); }
+    typedef SimpleAllocator AllocatorType;
+};
+
+enum DontReserve{ DONT_RESERVE };
+
+template<class T>
+struct ThreadMarkedAllocator
+{
+    T& allocator;
+    int bit;
+    
+    ThreadMarkedAllocator(T &sourceAllocator)
+        :allocator(sourceAllocator)
+        ,bit(sourceAllocator.BlockingReserveThreadBit())
+    {}
+    
+    ThreadMarkedAllocator(T &sourceAllocator,DontReserve)
+        :allocator(sourceAllocator)
+        ,bit(0)
+    {}
+    
+    ~ThreadMarkedAllocator()
+    {
+        allocator.ReturnThreadBit(bit);
+    }
+    
+    bool ReserveThreadBit()
+    {
+        if (bit==0) bit=sourceAllocator.ReserveThreadBit();
+        return bit!=0;
+    }
+    void BlockingReserveThreadBit()
+    {
+        if (bit==0) bit=sourceAllocator.BlockingReserveThreadBit();
+    }
+    void ReturnThreadBit()
+    {
+        int was = bit;
+        bit = 0;
+        sourceAllocator.ReturnThreadBit(was);
+    }
+    
+    void *Allocate() 
+    { 
+        assert(bit!=0);
+        return allocator.Allocate(bit); 
+    }
+    
+    void Deallocate(void *data) { allocator.Deallocate(data); }
+    int Size() const { return Allocator.Size(); }
+    typedef ThreadMarkedAllocator<T> AllocatorType;
+};
+
+template <typename T, typename BLOCK_ALLOCATOR>
+struct MPMarkFreeListAllocator
+{
+    MPMarkStack data;
+    
+    typedef MPMarkFreeListAllocator<T,BLOCK_ALLOCATOR> AllocatorType;
+    
+    BLOCK_ALLOCATOR &MyAllocator;
+    
+    MPMarkFreeListAllocator(BLOCK_ALLOCATOR &source)
+        :MyAllocator(source)
+    {}
+    
+    int ReserveThreadBit(){return data.ReserveThreadBit();}
+    int BlockingReserveThreadBit(){return data.BlockingReserveThreadBit();}
+    void ReturnThreadBit(int bit){ data.ReturnThreadBit(bit); }
+    
+    int Size() const { return sizeof(T); }
+    void AddBlock()
+    {
+        int blockSize = MyAllocator.Size();
+        int elementSize = sizeof(MPMemBlock<T>);
+        assert( MyAllocator.Size() >= elementSize);
+        MPMemBlock<T> *bottom = (MPMemBlock<T> *)MyAllocator.Allocate();
+        void *prev = 0;
+        MPMemBlock<T> *top = bottom;  
+        do {
+            top->bottomOfBlock = 0;
+            top->next = prev;
+            prev = (void *)top;
+            ++top;
+        }while( (blockSize-=elementSize) >= elementSize);
+        bottom->bottomOfBlock = 1;
+        data.PushList(top-1,bottom);
+    }
+    void *Allocate(int mark) 
+    {	
+        MPMemBlock<T> *ret;
+        for(;;) {
+            ret = (MPMemBlock<T> *)data.PopElement(mark);
+            if (ret!=NULL) return ret;
+            AddBlock();
+        }
+    }
+    void *Allocate() 
+    {	
+        MPMemBlock<T> *ret;
+        for(;;) {
+            ret = (MPMemBlock<T> *)data.PopElement();
+            if (ret!=NULL) return &ret->data;
+            AddBlock();
+        }
+    }
+    void Deallocate(void *ob) 
+    {
+        if (!ob) return;
+        MPMemBlock<T> *ret = (MPMemBlock<T> *)
+            ((char *)data - OFFSET_OF_MEMBER(MPMemBlock<T>,ob));
+        data.PushElement(ret);
+    }
+    //all memory allocated must be Deallocated before Clear() or the destructor is called
+    void Clear()
+    {
+        MPMemBlock<T> *stackWas = (MPMemBlock<T> *)data.ExchangeStack(NULL);
+        MPMemBlock<T> *blocks = NULL;
+        while(stackWas){
+            if (stackWas->bottomOfBlock){
+                if (blocks) {
+                    blocks->next = stackWas;
+                }
+                blocks = stackWas;
+            }
+            stackWas = (MPMemBlock<T> *)(stackWas->next);
+        }
+        while(blocks)
+        {
+            stackWas = (MPMemBlock<T> *)blocks->next;
+            MyAllocator.Deallocate(blocks);
+            blocks = stackWas;
+        }
+    }
+    //all memory allocated must be Deallocated before Clear() or the destructor is called
+    ~MPMarkFreeListAllocator()
+    {
+        Clear();
+    }
+};
+template <typename T, typename BLOCK_ALLOCATOR>
+struct MPCountFreeListAllocator
+{
+    MPCountStack data;
+    
+    typedef MPCountFreeListAllocator<T,BLOCK_ALLOCATOR> AllocatorType;
+    
+    BLOCK_ALLOCATOR & MyAllocator;
+    
+    MPCountFreeListAllocator(BLOCK_ALLOCATOR &source)
+        :MyAllocator(source)
+    {}
+    
+    int ReserveThreadBit(){return 1;}
+    int BlockingReserveThreadBit(){return 1;}
+    void ReturnThreadBit(int){  }
+    
+    int Size() const { return sizeof(T); }
+    void AddBlock()
+    {
+        int blockSize = MyAllocator.Size();
+        int elementSize = sizeof(MPMemBlock<T>);
+        assert( MyAllocator.Size() >= elementSize);
+        MPMemBlock<T> *bottom = (MPMemBlock<T> *)MyAllocator.Allocate();
+        void *prev = 0;
+        MPMemBlock<T> *top = bottom;  
+        do {
+            top->bottomOfBlock = 0;
+            top->next = prev;
+            prev = (void *)top;
+            ++top;
+        }while( (blockSize-=elementSize) >= elementSize);
+        bottom->bottomOfBlock = 1;
+        data.PushList(top-1,bottom);
+    }
+    void *Allocate(int) 
+    {	
+        return Allocate();
+    }
+    void *Allocate() 
+    {	
+        MPMemBlock<T> *ret;
+        for(;;) {
+            ret = (MPMemBlock<T> *)data.PopElement();
+            if (ret!=NULL) return &ret->data;
+            AddBlock();
+        }
+    }
+    void Deallocate(void *ob) 
+    {
+        if (!ob) return;
+        MPMemBlock<T> *ret = (MPMemBlock<T> *)
+            ((char *)data - OFFSET_OF_MEMBER(MPMemBlock<T>,ob));
+        data.PushElement(ret);
+    }
+    //all memory allocated must be Deallocated before Clear() or the destructor is called
+    void Clear() 
+    {
+        MPMemBlock<T> *stackWas = (MPMemBlock<T> *)data.ExchangeStack(NULL);
+        MPMemBlock<T> *blocks = NULL;
+        while(stackWas){
+            if (stackWas->bottomOfBlock){
+                if (blocks) {
+                    blocks->next = stackWas;
+                }
+                blocks = stackWas;
+            }
+            stackWas = (MPMemBlock<T> *)(stackWas->next);
+        }
+        while(blocks)
+        {
+            stackWas = (MPMemBlock<T> *)blocks->next;
+            MyAllocator.Deallocate(blocks);
+            blocks = stackWas;
+        }
+    }
+    //all memory allocated must be Deallocated before Clear() or the destructor is called
+    ~MPCountFreeListAllocator()
+    {
+        Clear();
+    }
+};
+
+struct Counted
+{
+    mutable AtomicInt refCount;
+    Counted():refCount(0){}
+};
+
+//it is NOT safe to read or write from more than one thread at a time
+template <typename T> 
+class CountedPtr
+{
+    
+protected:
+    T *value;
+    void decRefCount()
+    {
+        decRefCount(value);
+    }
+public:
+    static void decRefCount(T *ptr)
+    {
+        if (ptr && 0 == --ptr->refCount){
+            delete ptr;
+        }
+    }
+    void SetValueIgnoringRefCounts(T *ptr) 
+    { 
+        value = ptr; 
+    }
+    CountedPtr():letter(NULL){}
+    
+    CountedPtr(T *ptr)
+    {
+        value = ptr;
+        if (value) ++value->refCount;
+    }
+    
+    operator const T *() const { return value; }
+    operator T *() { return value; }
+    
+    CountedPtr<T> & operator=(const T *ptr)
+    {
+        if (ptr) ++ptr->refCount;
+        decRefCount();
+        value = ptr;
+        return *this;
+    }
+    
+    operator bool() const
+    { return value!=NULL; }
+    
+    bool operator !() const
+    { return value==NULL; }
+    
+    
+    const T & operator*() const
+    { return *value; }
+    
+    T & operator*()
+    { return *value; }
+    
+    T * operator->()
+    { assert(value); return value; }
+    const T * operator->()const
+    { assert(value); return value; }
+    
+    ~CountedPtr()
+    {
+        decRefCount();
+    }
+};
+
+template <typename T>
+class AtomicCountedPtr
+{
+protected:
+    mutable AtomicPtrWithCount<T> value;
+public:
+    
+    
+    CountedPtr<T> & LoadValue(CountedPtr<T> &dest) const 
+    { 
+retry:
+    AtomicPtrWithCount<T>::SimpleUnionType inc;
+    inc.Value().ptr = 0;
+    inc.Value().count = 1;
+    AtomicPtrWithCount<T>::SimpleUnionType ret;
+    ret.whole = (value.whole+=inc.whole); //increment in ptr
+    dest = ret.Value().ptr ; //increment in object
+    AtomicPtrWithCount<T>::SimpleUnionType update;
+    update.Value().ptr = ret.Value().ptr;
+    for(;;){
+        update.Value().count = ret.Value().count-1;
+        AtomicPtrWithCount<T>::SimpleUnionType current;
+        current.whole = value.whole.SetIfEqual(update.whole,ret.whole);
+        if (current.whole == ret.whole) break; //successfully decremented in ptr
+        if (current.Value().ptr != ret.Value().ptr) || current.Value().count < 1){
+            //because of the line "dest = ret.Value().ptr" above
+            //the old value will decrement in the prev object as the next object is loaded
+            goto retry; 
+        }
+        ret.Value().count = current.Value().count;
+    }
+    return dest; 
+    }
+    
+    CountedPtr<T> & LoadValue(CountedPtr<T> &dest, int &count) const 
+    { 
+retry:
+    AtomicPtrWithCount<T>::SimpleUnionType inc;
+    inc.Value().ptr = 0;
+    inc.Value().count = 1;
+    AtomicPtrWithCount<T>::SimpleUnionType ret;
+    ret.whole = (value.whole+=inc.whole); //increment in ptr
+    dest = ret.Value().ptr ; //increment in object
+    AtomicPtrWithCount<T>::SimpleUnionType update;
+    update.Value().ptr = ret.Value().ptr;
+    for(;;){
+        update.Value().count = ret.Value().count-1;
+        AtomicPtrWithCount<T>::SimpleUnionType current;
+        current.whole = value.whole.SetIfEqual(update.whole,ret.whole);
+        if (current.whole == ret.whole) break; //successfully decremented in ptr
+        if (current.Value().ptr != ret.Value().ptr) || current.Value().count < 1){
+            //because of the line "dest = ret.Value().ptr" above
+            //the old value will decrement in the prev object as the next object is loaded
+            goto retry; 
+        }
+        ret.Value().count = current.Value().count;
+    }
+    count = ret.Value().count;
+    return dest; 
+    }
+    
+    CountedPtr<T> & ExchangeInPlace(CountedPtr<T> &dest)
+    { 
+        AtomicPtrWithCount<T>::SimpleUnionType destWhole;
+        destWhole.Value().ptr = dest;
+        destWhole.Value().count = 0;
+        destWhole.whole = value.whole.Exchange(destWhole.whole);
+        if (destWhole.Value().ptr != NULL && destWhole.Value().count != 0) 
+            destWhole.Value().ptr->refCount += destWhole.Value().count;
+        dest.SetValueIgnoringRefCounts(destWhole.Value().ptr);
+        return dest; 
+    }
+    
+    CountedPtr<T> & Exchange(CountedPtr<T> &dest, T *source)
+    { 
+        dest = source;
+        return ExchangeInPlace(dest);
+    }
+    
+    bool CompareExchange(CountedPtr<T> &dest, T *exchange, T * comparend)
+    { 
+        CountedPtr<T> holdExchange(exchange);
+retry:
+        AtomicPtrWithCount<T>::SimpleUnionType comp;
+        if (LoadValue(dest,comp.Value().count) != comparend) return false;
+        comp.Value().ptr = dest;
+        AtomicPtrWithCount<T>::SimpleUnionType exch;
+        exch.Value().ptr=exchange;
+        exch.Value().count = 0;
+        AtomicPtrWithCount<T>::SimpleUnionType ret;
+        for (;;){
+            ret.whole = value.whole.CompareExchange(exch.whole,comp.whole);
+            if (ret.whole == comp.whole) {
+                //transfer increment from holdExchange to this
+                holdExchange.SetValueIgnoringRefCounts(NULL);
+                //had inc from Load and the one from this 
+                if (dest) {
+                    const int dec = 1-comp.Value().count;
+                    if (dec!=0) dest->refCount -= dec;
+                }
+                return true;
+            }
+            if (ret.Value().ptr != comp.Value().ptr) {
+                //cant return the new pointer because there's no
+                //reason to think it hasn't been collected already
+                goto retry;
+            }
+            comp.Value().count = ret.Value().count;
+        }
+    }
+    
+    bool SetIfEqual(T *exchange, T * comparend)
+    { 
+        CountedPtr<T> holdExchange(exchange);
+retry:
+        AtomicPtrWithCount<T>::SimpleUnionType comp;
+        comp.whole = value.whole;
+        if (comp.Value().ptr != comparend) return false;
+        AtomicPtrWithCount<T>::SimpleUnionType exch;
+        exch.Value().ptr=exchange;
+        exch.Value().count = 0;
+        AtomicPtrWithCount<T>::SimpleUnionType ret;
+        for (;;){
+            ret.whole = value.whole.CompareExchange(exch.whole,comp.whole);
+            if (ret.whole == comp.whole) {
+                if (comp.Value().ptr) {
+                    const int dec = 1-comp.Value().count;
+                    if (dec!=0 && 0==(comp.Value().ptr->refCount -= dec)){
+                        delete comp.Value().ptr;
+                    };
+                }
+                //transfer increment from holdExchange to this
+                holdExchange.SetValueIgnoringRefCounts(NULL);
+                return true;
+            }
+            if (ret.Value().ptr != comp.Value().ptr) {
+                return false;
+            }
+            comp.Value().count = ret.Value().count;
+        }
+    }
+    
+    void operator = (const T *ptr)
+    {
+        CountedPtr<T> dest(ptr);
+        ExchangeInPlace(dest);
+    }
+    
+    //not recommended because value won't be consistant throughout 
+    //a given expression
+    operator bool() const
+    { return value.Value().ptr!=NULL; }
+    
+    //not recommended because value won't be consistant throughout 
+    //a given expression
+    bool operator !() const
+    { return value.Value().ptr==NULL; }
+    
+    
+    AtomicCountedPtr()
+    {}
+    
+    ~AtomicCountedPtr()
+    {
+        *this = (T*)NULL;
+    }
+};
+
+class MPQueueBase
+{
+    AtomicUInt * data;
+    int len;
+    
+    struct AtomicHeadAndTail{
+        AtomicInt head;
+        AtomicInt tail;
+    };
+    
+    AtomicUnion<AtomicHeadAndTail> headAndTail;
+    
+    struct HeadAndTailStruct{
+        int head;
+        int tail;
+    };
+    union HeadAndTailUnion
+    {
+        HeadAndTailStruct value;
+        unsigned __int64 whole;
+    };
+    
+    AtomicUInt & Index(int i)
+    {
+        return data[i & (len-1)];
+    }
+    bool IsNil(int i)
+    {
+        return (i & 1) != 0;
+    }
+    
+    public:
+        
+        int Used() const
+        {
+            HeadAndTailUnion ht;
+            ht.whole = headAndTail.whole;
+            return ht.value.head-ht.value.tail;
+            
+        }
+        bool Empty() const
+        {
+            HeadAndTailUnion ht;
+            ht.whole = headAndTail.whole;
+            return ht.value.head==ht.value.tail;
+        }
+        int Available() const
+        {
+            return MaxLen()-Used();
+            
+        }
+        int MaxLen() const
+        {
+            return len;
+        }
+        
+        MPQueueBase(int lenLn2)
+            :len(1<<lenLn2)
+        {
+            headAndTail.whole = 0;
+            data = new AtomicUInt[len];
+            for (int i=0;i<len;++i) data[i] = 1;
+        }
+        
+        ~MPQueueBase()
+        {
+            delete [] data;
+        }
+        
+        bool Put(int v)
+        {
+            assert((v&1)==0);
+            for(;;){
+                HeadAndTailUnion ht;
+                ht.whole = headAndTail.whole;
+                if (ht.value.head-ht.value.tail >= len) return false;
+                
+                int wasThere = Index(ht.value.head);
+                
+                if (IsNil(wasThere)){
+                    if (Index(ht.value.head).SetIfEqual(v,wasThere)){
+                        headAndTail.Value().head.SetIfEqual(1+ht.value.head,ht.value.head);
+                        return true;
+                    }
+                }
+                headAndTail.Value().head.SetIfEqual(1+ht.value.head,ht.value.head);
+            }
+        }
+        int Get()
+        {
+            for(;;){
+                HeadAndTailUnion ht;
+                ht.whole = headAndTail.whole;
+                if (ht.value.head-ht.value.tail < 1) return 0;
+                
+                int wasThere = Index(ht.value.tail);
+                int newNil = (ht.value.tail|1);
+                
+                if (!IsNil(wasThere)){
+                    if (Index(ht.value.tail).SetIfEqual(newNil,wasThere)){
+                        headAndTail.Value().tail.SetIfEqual(1+ht.value.tail,ht.value.tail);
+                        return wasThere;
+                    }
+                }
+                headAndTail.Value().tail.SetIfEqual(1+ht.value.tail,ht.value.tail);
+            }
+        }
+};
+
+template <typename T>
+class MPQueue 
+{
+    MPQueueBase queue;
+public:
+    MPQueue(int lenLn2)
+        :queue(lenLn2)
+    {}
+    T *Get()
+    {
+        return (T*)queue.Get();
+    }
+    bool Put(T * v)
+    {
+        return queue.Put((int)v);
+    }
+    bool Empty() const
+    {
+        return queue.Empty();
+    }
+    int Used() const
+    {
+        return queue.Used();
+        
+    }
+    int Available() const
+    {
+        return queue.Available();
+        
+    }
+    int MaxLen() const
+    {
+        return queue.MaxLen();
+    }
+};
+
+class QueueBase
+{
+    unsigned * data;
+    int len;
+    
+    int head;
+    int tail;
+    
+    unsigned & Index(int i)
+    {
+        return data[i & (len-1)];
+    }
+    
+public:
+    
+    int Used() const
+    {
+        return head-tail;
+        
+    }
+    bool Empty() const
+    {
+        return head==tail;
+        
+    }
+    
+    int Available() const
+    {
+        return MaxLen()-Used();
+        
+    }
+    int MaxLen() const
+    {
+        return len;
+    }
+    
+    QueueBase(int lenLn2)
+        :len(1<<lenLn2)
+        ,head(0)
+        ,tail(0)
+    {
+        data = new unsigned[len];
+        for (int i=0;i<len;++i) data[i] = 1;
+    }
+    
+    ~QueueBase()
+    {
+        delete [] data;
+    }
+    
+    bool Put(int v)
+    {
+        if (Available() < 1) return false;
+        Index(head++) = v;
+        return true;
+    }
+    
+    int Get()
+    {
+        if (Used() < 1) return 0;
+        return Index(tail++);
+    }
+};
+template <typename T>
+class Queue 
+{
+    QueueBase queue;
+public:
+    Queue(int lenLn2)
+        :queue(lenLn2)
+    {}
+    T *Get()
+    {
+        return (T*)queue.Get();
+    }
+    bool Put(T * v)
+    {
+        return queue.Put((int)v);
+    }
+    bool Empty() const
+    {
+        return queue.Empty();
+    }
+    int Used() const
+    {
+        return queue.Used();
+        
+    }
+    int Available() const
+    {
+        return queue.Available();
+        
+    }
+    int MaxLen() const
+    {
+        return queue.MaxLen();
+    }
+};
+
+
+#endif