# # # Nimrod's Runtime Library # (c) Copyright 2010 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. # # Low level allocator for Nimrod. Has been designed to support the GC. # TODO: # - eliminate "used" field # - make searching for block O(1) const debugGC = false # we wish to debug the GC... logGC = false traceGC = false # extensive debugging reallyDealloc = true # for debugging purposes this can be set to false cycleGC = true # (de)activate the cycle GC stressGC = false reallyOsDealloc = true coalescRight = true coalescLeft = true overwriteFree = false # Page size of the system; in most cases 4096 bytes. For exotic OS or # CPU this needs to be changed: const PageShift = 12 PageSize = 1 shl PageShift PageMask = PageSize-1 MemAlign = 8 # also minimal allocatable memory block BitsPerPage = PageSize div MemAlign UnitsPerPage = BitsPerPage div (sizeof(int)*8) # how many ints do we need to describe a page: # on 32 bit systems this is only 16 (!) TrunkShift = 9 BitsPerTrunk = 1 shl TrunkShift # needs to be power of 2 and divisible by 64 TrunkMask = BitsPerTrunk - 1 IntsPerTrunk = BitsPerTrunk div (sizeof(int)*8) IntShift = 5 + ord(sizeof(int) == 8) # 5 or 6, depending on int width IntMask = 1 shl IntShift - 1 proc raiseOutOfMem() {.noreturn.} = quit("out of memory") # ------------ platform specific chunk allocation code ----------------------- when defined(posix): const PROT_READ = 1 # page can be read PROT_WRITE = 2 # page can be written MAP_PRIVATE = 2 # Changes are private when defined(linux) or defined(aix): const MAP_ANONYMOUS = 0x20 # don't use a file elif defined(macosx) or defined(bsd): const MAP_ANONYMOUS = 0x1000 elif defined(solaris): const MAP_ANONYMOUS = 0x100 else: {.error: "Port memory manager to your platform".} proc mmap(adr: pointer, len: int, prot, flags, fildes: cint, off: int): pointer {.header: "".} proc munmap(adr: pointer, len: int) {.header: "".} proc osAllocPages(size: int): pointer {.inline.} = result = mmap(nil, size, PROT_READ or PROT_WRITE, MAP_PRIVATE or MAP_ANONYMOUS, -1, 0) if result == nil or result == cast[pointer](-1): raiseOutOfMem() proc osDeallocPages(p: pointer, size: int) {.inline} = when reallyOsDealloc: munmap(p, size) elif defined(windows): const MEM_RESERVE = 0x2000 MEM_COMMIT = 0x1000 MEM_TOP_DOWN = 0x100000 PAGE_READWRITE = 0x04 MEM_DECOMMIT = 0x4000 MEM_RELEASE = 0x8000 proc VirtualAlloc(lpAddress: pointer, dwSize: int, flAllocationType, flProtect: int32): pointer {. header: "", stdcall.} proc VirtualFree(lpAddress: pointer, dwSize: int, dwFreeType: int32) {.header: "", stdcall.} proc osAllocPages(size: int): pointer {.inline.} = result = VirtualAlloc(nil, size, MEM_RESERVE or MEM_COMMIT, PAGE_READWRITE) if result == nil: raiseOutOfMem() proc osDeallocPages(p: pointer, size: int) {.inline.} = # according to Microsoft, 0 is the only correct value here: when reallyOsDealloc: VirtualFree(p, 0, MEM_RELEASE) else: {.error: "Port memory manager to your platform".} # --------------------- end of non-portable code ----------------------------- # We manage *chunks* of memory. Each chunk is a multiple of the page size. # Each chunk starts at an address that is divisible by the page size. Chunks # that are bigger than ``ChunkOsReturn`` are returned back to the operating # system immediately. const ChunkOsReturn = 256 * PageSize InitialMemoryRequest = ChunkOsReturn div 2 # < ChunkOsReturn! SmallChunkSize = PageSize type PTrunk = ptr TTrunk TTrunk {.final.} = object next: PTrunk # all nodes are connected with this pointer key: int # start address at bit 0 bits: array[0..IntsPerTrunk-1, int] # a bit vector TTrunkBuckets = array[0..1023, PTrunk] TIntSet {.final.} = object data: TTrunkBuckets type TAlignType = biggestFloat TFreeCell {.final, pure.} = object next: ptr TFreeCell # next free cell in chunk (overlaid with refcount) zeroField: int # 0 means cell is not used (overlaid with typ field) # 1 means cell is manually managed pointer PChunk = ptr TBaseChunk PBigChunk = ptr TBigChunk PSmallChunk = ptr TSmallChunk TBaseChunk {.pure.} = object prevSize: int # size of previous chunk; for coalescing size: int # if < PageSize it is a small chunk used: bool # later will be optimized into prevSize... TSmallChunk = object of TBaseChunk next, prev: PSmallChunk # chunks of the same size freeList: ptr TFreeCell free: int # how many bytes remain acc: int # accumulator for small object allocation data: TAlignType # start of usable memory TBigChunk = object of TBaseChunk # not necessarily > PageSize! next: PBigChunk # chunks of the same (or bigger) size prev: PBigChunk align: int data: TAlignType # start of usable memory template smallChunkOverhead(): expr = sizeof(TSmallChunk)-sizeof(TAlignType) template bigChunkOverhead(): expr = sizeof(TBigChunk)-sizeof(TAlignType) proc roundup(x, v: int): int {.inline.} = result = (x + (v-1)) and not (v-1) assert(result >= x) #return ((-x) and (v-1)) +% x assert(roundup(14, PageSize) == PageSize) assert(roundup(15, 8) == 16) assert(roundup(65, 8) == 72) # ------------- chunk table --------------------------------------------------- # We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk # endings of big chunks. This is needed by the merging operation. The only # remaining operation is best-fit for big chunks. Since there is a size-limit # for big chunks (because greater than the limit means they are returned back # to the OS), a fixed size array can be used. type PLLChunk = ptr TLLChunk TLLChunk {.pure.} = object ## *low-level* chunk size: int # remaining size acc: int # accumulator TAllocator {.final, pure.} = object llmem: PLLChunk currMem, maxMem, freeMem: int # memory sizes (allocated from OS) freeSmallChunks: array[0..SmallChunkSize div MemAlign-1, PSmallChunk] freeChunksList: PBigChunk # XXX make this a datastructure with O(1) access chunkStarts: TIntSet proc incCurrMem(a: var TAllocator, bytes: int) {.inline.} = inc(a.currMem, bytes) proc decCurrMem(a: var TAllocator, bytes: int) {.inline.} = a.maxMem = max(a.maxMem, a.currMem) dec(a.currMem, bytes) proc getMaxMem(a: var TAllocator): int = # Since we update maxPagesCount only when freeing pages, # maxPagesCount may not be up to date. Thus we use the # maximum of these both values here: return max(a.currMem, a.maxMem) var allocator: TAllocator proc llAlloc(a: var TAllocator, size: int): pointer = # *low-level* alloc for the memory managers data structures. Deallocation # is never done. if a.llmem == nil or size > a.llmem.size: var request = roundup(size+sizeof(TLLChunk), PageSize) a.llmem = cast[PLLChunk](osAllocPages(request)) incCurrMem(a, request) a.llmem.size = request - sizeof(TLLChunk) a.llmem.acc = sizeof(TLLChunk) result = cast[pointer](cast[TAddress](a.llmem) + a.llmem.acc) dec(a.llmem.size, size) inc(a.llmem.acc, size) zeroMem(result, size) proc IntSetGet(t: TIntSet, key: int): PTrunk = var it = t.data[key and high(t.data)] while it != nil: if it.key == key: return it it = it.next result = nil proc IntSetPut(t: var TIntSet, key: int): PTrunk = result = IntSetGet(t, key) if result == nil: result = cast[PTrunk](llAlloc(allocator, sizeof(result^))) result.next = t.data[key and high(t.data)] t.data[key and high(t.data)] = result result.key = key proc Contains(s: TIntSet, key: int): bool = var t = IntSetGet(s, key shr TrunkShift) if t != nil: var u = key and TrunkMask result = (t.bits[u shr IntShift] and (1 shl (u and IntMask))) != 0 else: result = false proc Incl(s: var TIntSet, key: int) = var t = IntSetPut(s, key shr TrunkShift) var u = key and TrunkMask t.bits[u shr IntShift] = t.bits[u shr IntShift] or (1 shl (u and IntMask)) proc Excl(s: var TIntSet, key: int) = var t = IntSetGet(s, key shr TrunkShift) if t != nil: var u = key and TrunkMask t.bits[u shr IntShift] = t.bits[u shr IntShift] and not (1 shl (u and IntMask)) proc ContainsOrIncl(s: var TIntSet, key: int): bool = var t = IntSetGet(s, key shr TrunkShift) if t != nil: var u = key and TrunkMask result = (t.bits[u shr IntShift] and (1 shl (u and IntMask))) != 0 if not result: t.bits[u shr IntShift] = t.bits[u shr IntShift] or (1 shl (u and IntMask)) else: Incl(s, key) result = false # ------------- chunk management ---------------------------------------------- proc pageIndex(c: PChunk): int {.inline.} = result = cast[TAddress](c) shr PageShift proc pageIndex(p: pointer): int {.inline.} = result = cast[TAddress](p) shr PageShift proc pageAddr(p: pointer): PChunk {.inline.} = result = cast[PChunk](cast[TAddress](p) and not PageMask) assert(Contains(allocator.chunkStarts, pageIndex(result))) var lastSize = PageSize proc requestOsChunks(a: var TAllocator, size: int): PBigChunk = incCurrMem(a, size) inc(a.freeMem, size) result = cast[PBigChunk](osAllocPages(size)) assert((cast[TAddress](result) and PageMask) == 0) #zeroMem(result, size) result.next = nil result.prev = nil result.used = false result.size = size # update next.prevSize: var nxt = cast[TAddress](result) +% size assert((nxt and PageMask) == 0) var next = cast[PChunk](nxt) if pageIndex(next) in a.chunkStarts: #echo("Next already allocated!") next.prevSize = size # set result.prevSize: var prv = cast[TAddress](result) -% lastSize assert((nxt and PageMask) == 0) var prev = cast[PChunk](prv) if pageIndex(prev) in a.chunkStarts and prev.size == lastSize: #echo("Prev already allocated!") result.prevSize = lastSize else: result.prevSize = 0 # unknown lastSize = size # for next request proc freeOsChunks(a: var TAllocator, p: pointer, size: int) = # update next.prevSize: var c = cast[PChunk](p) var nxt = cast[TAddress](p) +% c.size assert((nxt and PageMask) == 0) var next = cast[PChunk](nxt) if pageIndex(next) in a.chunkStarts: next.prevSize = 0 # XXX used excl(a.chunkStarts, pageIndex(p)) osDeallocPages(p, size) decCurrMem(a, size) dec(a.freeMem, size) #c_fprintf(c_stdout, "[Alloc] back to OS: %ld\n", size) proc isAccessible(p: pointer): bool {.inline.} = result = Contains(allocator.chunkStarts, pageIndex(p)) proc contains[T](list, x: T): bool = var it = list while it != nil: if it == x: return true it = it.next when false: proc writeFreeList(a: TAllocator) = var it = a.freeChunksList c_fprintf(c_stdout, "freeChunksList: %p\n", it) while it != nil: c_fprintf(c_stdout, "it: %p, next: %p, prev: %p\n", it, it.next, it.prev) it = it.next proc ListAdd[T](head: var T, c: T) {.inline.} = assert(c notin head) assert c.prev == nil assert c.next == nil c.next = head if head != nil: assert head.prev == nil head.prev = c head = c proc ListRemove[T](head: var T, c: T) {.inline.} = assert(c in head) if c == head: head = c.next assert c.prev == nil if head != nil: head.prev = nil else: assert c.prev != nil c.prev.next = c.next if c.next != nil: c.next.prev = c.prev c.next = nil c.prev = nil proc isSmallChunk(c: PChunk): bool {.inline.} = return c.size <= SmallChunkSize-smallChunkOverhead() #return c.size < SmallChunkSize proc chunkUnused(c: PChunk): bool {.inline.} = result = not c.used proc updatePrevSize(a: var TAllocator, c: PBigChunk, prevSize: int) {.inline.} = var ri = cast[PChunk](cast[TAddress](c) +% c.size) assert((cast[TAddress](ri) and PageMask) == 0) if isAccessible(ri): ri.prevSize = prevSize proc freeBigChunk(a: var TAllocator, c: PBigChunk) = var c = c assert(c.size >= PageSize) inc(a.freeMem, c.size) when coalescRight: var ri = cast[PChunk](cast[TAddress](c) +% c.size) assert((cast[TAddress](ri) and PageMask) == 0) if isAccessible(ri) and chunkUnused(ri): assert(not isSmallChunk(ri)) if not isSmallChunk(ri): ListRemove(a.freeChunksList, cast[PBigChunk](ri)) inc(c.size, ri.size) excl(a.chunkStarts, pageIndex(ri)) when coalescLeft: if c.prevSize != 0: var le = cast[PChunk](cast[TAddress](c) -% c.prevSize) assert((cast[TAddress](le) and PageMask) == 0) if isAccessible(le) and chunkUnused(le): assert(not isSmallChunk(le)) if not isSmallChunk(le): ListRemove(a.freeChunksList, cast[PBigChunk](le)) inc(le.size, c.size) excl(a.chunkStarts, pageIndex(c)) c = cast[PBigChunk](le) if c.size < ChunkOsReturn: incl(a.chunkStarts, pageIndex(c)) updatePrevSize(a, c, c.size) ListAdd(a.freeChunksList, c) c.used = false else: freeOsChunks(a, c, c.size) proc splitChunk(a: var TAllocator, c: PBigChunk, size: int) = var rest = cast[PBigChunk](cast[TAddress](c) +% size) assert(rest notin a.freeChunksList) # c_fprintf(c_stdout, "to add: %p\n", rest) # writeFreeList(allocator) # assert false rest.size = c.size - size rest.used = false rest.next = nil rest.prev = nil rest.prevSize = size updatePrevSize(a, c, rest.size) c.size = size incl(a.chunkStarts, pageIndex(rest)) ListAdd(a.freeChunksList, rest) proc getBigChunk(a: var TAllocator, size: int): PBigChunk = # use first fit for now: assert((size and PageMask) == 0) assert(size > 0) result = a.freeChunksList block search: while result != nil: #if not chunkUnused(result): # c_fprintf(c_stdout, "%lld\n", int(result.used)) assert chunkUnused(result) if result.size == size: ListRemove(a.freeChunksList, result) break search elif result.size > size: #c_fprintf(c_stdout, "res size: %lld; size: %lld\n", result.size, size) ListRemove(a.freeChunksList, result) splitChunk(a, result, size) break search result = result.next assert result != a.freeChunksList if size < InitialMemoryRequest: result = requestOsChunks(a, InitialMemoryRequest) splitChunk(a, result, size) else: result = requestOsChunks(a, size) result.prevSize = 0 # XXX why is this needed? result.used = true incl(a.chunkStarts, pageIndex(result)) dec(a.freeMem, size) proc getSmallChunk(a: var TAllocator): PSmallChunk = var res = getBigChunk(a, PageSize) assert res.prev == nil assert res.next == nil result = cast[PSmallChunk](res) # ----------------------------------------------------------------------------- proc getCellSize(p: pointer): int {.inline.} = var c = pageAddr(p) result = c.size proc rawAlloc(a: var TAllocator, requestedSize: int): pointer = assert(roundup(65, 8) == 72) assert requestedSize >= sizeof(TFreeCell) var size = roundup(requestedSize, MemAlign) #c_fprintf(c_stdout, "alloc; size: %ld; %ld\n", requestedSize, size) if size <= SmallChunkSize-smallChunkOverhead(): # allocate a small block: for small chunks, we use only its next pointer var s = size div MemAlign var c = a.freeSmallChunks[s] if c == nil: c = getSmallChunk(a) c.freeList = nil assert c.size == PageSize c.size = size c.acc = size c.free = SmallChunkSize - smallChunkOverhead() - size c.next = nil c.prev = nil ListAdd(a.freeSmallChunks[s], c) result = addr(c.data) assert((cast[TAddress](result) and (MemAlign-1)) == 0) else: assert c.next != c #if c.size != size: # c_fprintf(c_stdout, "csize: %lld; size %lld\n", c.size, size) assert c.size == size if c.freeList == nil: assert(c.acc + smallChunkOverhead() + size <= SmallChunkSize) result = cast[pointer](cast[TAddress](addr(c.data)) +% c.acc) inc(c.acc, size) else: result = c.freeList assert(c.freeList.zeroField == 0) c.freeList = c.freeList.next dec(c.free, size) assert((cast[TAddress](result) and (MemAlign-1)) == 0) if c.free < size: ListRemove(a.freeSmallChunks[s], c) else: size = roundup(requestedSize+bigChunkOverhead(), PageSize) # allocate a large block var c = getBigChunk(a, size) assert c.prev == nil assert c.next == nil assert c.size == size result = addr(c.data) assert((cast[TAddress](result) and (MemAlign-1)) == 0) assert(isAccessible(result)) proc rawDealloc(a: var TAllocator, p: pointer) = var c = pageAddr(p) if isSmallChunk(c): # `p` is within a small chunk: var c = cast[PSmallChunk](c) var s = c.size var f = cast[ptr TFreeCell](p) #echo("setting to nil: ", $cast[TAddress](addr(f.zeroField))) assert(f.zeroField != 0) f.zeroField = 0 f.next = c.freeList c.freeList = f when overwriteFree: # set to 0xff to check for usage after free bugs: c_memset(cast[pointer](cast[int](p) +% sizeof(TFreeCell)), -1'i32, s -% sizeof(TFreeCell)) # check if it is not in the freeSmallChunks[s] list: if c.free < s: assert c notin a.freeSmallChunks[s div memAlign] # add it to the freeSmallChunks[s] array: ListAdd(a.freeSmallChunks[s div memAlign], c) inc(c.free, s) else: inc(c.free, s) if c.free == SmallChunkSize-smallChunkOverhead(): ListRemove(a.freeSmallChunks[s div memAlign], c) c.size = SmallChunkSize freeBigChunk(a, cast[PBigChunk](c)) else: # set to 0xff to check for usage after free bugs: when overwriteFree: c_memset(p, -1'i32, c.size -% bigChunkOverhead()) # free big chunk freeBigChunk(a, cast[PBigChunk](c)) proc isAllocatedPtr(a: TAllocator, p: pointer): bool = if isAccessible(p): var c = pageAddr(p) if not chunkUnused(c): if isSmallChunk(c): var c = cast[PSmallChunk](c) var offset = (cast[TAddress](p) and PageMask) -% smallChunkOverhead() result = (c.acc >% offset) and (offset %% c.size == 0) and (cast[ptr TFreeCell](p).zeroField >% 1) else: var c = cast[PBigChunk](c) result = p == addr(c.data) and cast[ptr TFreeCell](p).zeroField >% 1 # ---------------------- interface to programs ------------------------------- when true: proc alloc(size: int): pointer = result = rawAlloc(allocator, size+sizeof(TFreeCell)) cast[ptr TFreeCell](result).zeroField = 2 # mark it as used #assert(not isAllocatedPtr(allocator, result)) result = cast[pointer](cast[TAddress](result) +% sizeof(TFreeCell)) proc alloc0(size: int): pointer = result = talloc.alloc(size) zeroMem(result, size) proc dealloc(p: pointer) = var x = cast[pointer](cast[TAddress](p) -% sizeof(TFreeCell)) assert(cast[ptr TFreeCell](x).zeroField == 2) rawDealloc(allocator, x) assert(not isAllocatedPtr(allocator, x)) proc isAllocatedPtr(p: pointer): bool = var x = cast[pointer](cast[TAddress](p) -% sizeof(TFreeCell)) result = isAllocatedPtr(allocator, x) proc ptrSize(p: pointer): int = var x = cast[pointer](cast[TAddress](p) -% sizeof(TFreeCell)) result = pageAddr(x).size - sizeof(TFreeCell) proc realloc(p: pointer, newsize: int): pointer = if newsize > 0: result = talloc.alloc(newsize) if p != nil: copyMem(result, p, ptrSize(p)) talloc.dealloc(p) elif p != nil: talloc.dealloc(p) proc countFreeMem(): int = # only used for assertions var it = allocator.freeChunksList while it != nil: inc(result, it.size) it = it.next proc getFreeMem(): int = result = allocator.freeMem #assert(result == countFreeMem()) proc getTotalMem(): int = return allocator.currMem proc getOccupiedMem(): int = return talloc.getTotalMem() - talloc.getFreeMem() when isMainModule: const iterations = 4000_000 incl(allocator.chunkStarts, 11) assert 11 in allocator.chunkStarts excl(allocator.chunkStarts, 11) assert 11 notin allocator.chunkStarts var p: array [1..iterations, pointer] for i in 4..70: var x = i * 8 for j in 1.. iterations: p[j] = talloc.Alloc(x) for j in 1..iterations: assert isAllocatedPtr(p[j]) echo($i, " used memory: ", $(allocator.currMem)) for j in countdown(iterations, 1): #echo("j: ", $j) talloc.dealloc(p[j]) assert(not isAllocatedPtr(allocator, p[j])) echo($i, " after freeing: ", $(allocator.currMem))