-rw-r--r-- | compiler/ccgexprs.nim                          |  22
-rw-r--r-- | compiler/liftdestructors.nim                   |   8
-rw-r--r-- | lib/core/typeinfo.nim                          |   4
-rw-r--r-- | lib/system/arc.nim                             |  47
-rw-r--r-- | lib/system/bitmasks.nim                        |   6
-rw-r--r-- | lib/system/deepcopy.nim                        |   2
-rw-r--r-- | lib/system/memalloc.nim                        |  81
-rw-r--r-- | lib/system/orc.nim                             |   2
-rw-r--r-- | lib/system/seqs_v2.nim                         |  15
-rw-r--r-- | tests/arc/thard_alignment.nim                  | 146
-rw-r--r-- | tests/collections/thardalignmentconstraint.nim |  17
11 files changed, 273 insertions, 77 deletions
diff --git a/compiler/ccgexprs.nim b/compiler/ccgexprs.nim
index b6caec760..1eb6caeb3 100644
--- a/compiler/ccgexprs.nim
+++ b/compiler/ccgexprs.nim
@@ -1278,11 +1278,11 @@ proc rawGenNew(p: BProc, a: var TLoc, sizeExpr: Rope; needsInit: bool) =
   if optTinyRtti in p.config.globalOptions:
     if needsInit:
-      b.r = ropecg(p.module, "($1) #nimNewObj($2)",
-          [getTypeDesc(p.module, typ), sizeExpr])
+      b.r = ropecg(p.module, "($1) #nimNewObj($2, NIM_ALIGNOF($3))",
+          [getTypeDesc(p.module, typ), sizeExpr, getTypeDesc(p.module, bt)])
     else:
-      b.r = ropecg(p.module, "($1) #nimNewObjUninit($2)",
-          [getTypeDesc(p.module, typ), sizeExpr])
+      b.r = ropecg(p.module, "($1) #nimNewObjUninit($2, NIM_ALIGNOF($3))",
+          [getTypeDesc(p.module, typ), sizeExpr, getTypeDesc(p.module, bt)])
     genAssignment(p, a, b, {})
   else:
     let ti = genTypeInfoV1(p.module, typ, a.lode.info)
@@ -2191,14 +2191,10 @@ proc genDestroy(p: BProc; n: PNode) =
       of tySequence:
         var a: TLoc
         initLocExpr(p, arg, a)
-        if optThreads in p.config.globalOptions:
-          linefmt(p, cpsStmts, "if ($1.p && !($1.p->cap & NIM_STRLIT_FLAG)) {$n" &
-            " #deallocShared($1.p);$n" &
-            "}$n", [rdLoc(a)])
-        else:
-          linefmt(p, cpsStmts, "if ($1.p && !($1.p->cap & NIM_STRLIT_FLAG)) {$n" &
-            " #dealloc($1.p);$n" &
-            "}$n", [rdLoc(a)])
+        linefmt(p, cpsStmts, "if ($1.p && !($1.p->cap & NIM_STRLIT_FLAG)) {$n" &
+          " #alignedDealloc($1.p, NIM_ALIGNOF($2));$n" &
+          "}$n",
+          [rdLoc(a), getTypeDesc(p.module, t.lastSon)])
       else: discard "nothing to do"
     else:
       let t = n[1].typ.skipTypes(abstractVar)
@@ -2217,7 +2213,7 @@ proc genDispose(p: BProc; n: PNode) =
       if elemType.destructor != nil:
         var destroyCall = newNodeI(nkCall, n.info)
         genStmts(p, destroyCall)
-      lineCg(p, cpsStmts, ["#nimRawDispose($#)", rdLoc(a)])
+      lineFmt(p, cpsStmts, "#nimRawDispose($1, NIM_ALIGNOF($2))", [rdLoc(a), getTypeDesc(p.module, elemType)])
     else:
       # ``nimRawDisposeVirtual`` calls the ``finalizer`` which is the same as the
       # destructor, but it uses the runtime type. Afterwards the memory is freed:
diff --git a/compiler/liftdestructors.nim b/compiler/liftdestructors.nim
index 30094875c..9b6c179db 100644
--- a/compiler/liftdestructors.nim
+++ b/compiler/liftdestructors.nim
@@ -515,7 +515,9 @@ proc atomicRefOp(c: var TLiftCtx; t: PType; body, x, y: PNode) =
     if isFinal(elemType):
       addDestructorCall(c, elemType, actions, genDeref(x, nkDerefExpr))
-      actions.add callCodegenProc(c.g, "nimRawDispose", c.info, x)
+      var alignOf = genBuiltin(c.g, mAlignOf, "alignof", newNodeIT(nkType, c.info, elemType))
+      alignOf.typ = getSysType(c.g, c.info, tyInt)
+      actions.add callCodegenProc(c.g, "nimRawDispose", c.info, x, alignOf)
     else:
       addDestructorCall(c, elemType, newNodeI(nkStmtList, c.info), genDeref(x, nkDerefExpr))
       actions.add callCodegenProc(c.g, "nimDestroyAndDispose", c.info, x)
@@ -638,7 +640,9 @@ proc ownedRefOp(c: var TLiftCtx; t: PType; body, x, y: PNode) =
     if isFinal(elemType):
       addDestructorCall(c, elemType, actions, genDeref(x, nkDerefExpr))
-      actions.add callCodegenProc(c.g, "nimRawDispose", c.info, x)
+      var alignOf = genBuiltin(c.g, mAlignOf, "alignof", newNodeIT(nkType, c.info, elemType))
+      alignOf.typ = getSysType(c.g, c.info, tyInt)
+      actions.add callCodegenProc(c.g, "nimRawDispose", c.info, x, alignOf)
     else:
       addDestructorCall(c, elemType, newNodeI(nkStmtList, c.info), genDeref(x, nkDerefExpr))
      actions.add callCodegenProc(c.g, "nimDestroyAndDispose", c.info, x)
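For orientation: `atomicRefOp` and `ownedRefOp` build the `=destroy` hook that the compiler lifts for `ref T`. For a final element type, the lifted code now amounts to the following sketch (illustrative only, not runnable user code — `nimDecRefIsLast` and `nimRawDispose` are internal runtime entry points, and `destroyRef` is a hypothetical name):

    # hand-written rendering of the lifted destroy hook for a final `ref T`
    proc destroyRef[T](x: ref T) =
      if nimDecRefIsLast(cast[pointer](x)):
        `=destroy`(x[])                              # destroy the fields first
        nimRawDispose(cast[pointer](x), alignof(T))  # then free, now alignment-aware

Non-final types still go through `nimDestroyAndDispose`, which fetches the alignment from the runtime type descriptor instead; see the `arc.nim` hunks below.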
diff --git a/lib/core/typeinfo.nim b/lib/core/typeinfo.nim
index 9fd8ebf8c..5c4e1b601 100644
--- a/lib/core/typeinfo.nim
+++ b/lib/core/typeinfo.nim
@@ -108,7 +108,7 @@ when not defined(gcDestructors):
   proc newSeq(typ: PNimType, len: int): pointer {.importCompilerProc.}
   proc objectInit(dest: pointer, typ: PNimType) {.importCompilerProc.}
 else:
-  proc nimNewObj(size: int): pointer {.importCompilerProc.}
+  proc nimNewObj(size, align: int): pointer {.importCompilerProc.}
   proc newSeqPayload(cap, elemSize, elemAlign: int): pointer {.importCompilerProc.}
   proc prepareSeqAdd(len: int; p: pointer; addlen, elemSize, elemAlign: int): pointer {.
     importCompilerProc.}
@@ -178,7 +178,7 @@ proc invokeNew*(x: Any) =
   ## performs ``new(x)``. `x` needs to represent a ``ref``.
   assert x.rawType.kind == tyRef
   when defined(gcDestructors):
-    cast[ppointer](x.value)[] = nimNewObj(x.rawType.base.size)
+    cast[ppointer](x.value)[] = nimNewObj(x.rawType.base.size, x.rawType.base.align)
   else:
     var z = newObj(x.rawType, x.rawType.base.size)
     genericAssign(x.value, addr(z), x.rawType)
diff --git a/lib/system/arc.nim b/lib/system/arc.nim
index 46579eaef..0eecadd66 100644
--- a/lib/system/arc.nim
+++ b/lib/system/arc.nim
@@ -75,14 +75,13 @@ when defined(nimArcDebug):
 elif defined(nimArcIds):
   var gRefId: int

-proc nimNewObj(size: int): pointer {.compilerRtl.} =
-  let s = size + sizeof(RefHeader)
+proc nimNewObj(size, alignment: int): pointer {.compilerRtl.} =
+  let hdrSize = align(sizeof(RefHeader), alignment)
+  let s = size + hdrSize
   when defined(nimscript):
     discard
-  elif compileOption("threads"):
-    result = allocShared0(s) +! sizeof(RefHeader)
   else:
-    result = alloc0(s) +! sizeof(RefHeader)
+    result = alignedAlloc0(s, alignment) +! hdrSize
   when defined(nimArcDebug) or defined(nimArcIds):
     head(result).refId = gRefId
     atomicInc gRefId
@@ -92,20 +91,18 @@ proc nimNewObj(size: int): pointer {.compilerRtl.} =
   when traceCollector:
     cprintf("[Allocated] %p result: %p\n", result -! sizeof(RefHeader), result)

-proc nimNewObjUninit(size: int): pointer {.compilerRtl.} =
+proc nimNewObjUninit(size, alignment: int): pointer {.compilerRtl.} =
   # Same as 'newNewObj' but do not initialize the memory to zero.
   # The codegen proved for us that this is not necessary.
-  let s = size + sizeof(RefHeader)
+  let hdrSize = align(sizeof(RefHeader), alignment)
+  let s = size + hdrSize
   when defined(nimscript):
     discard
-  elif compileOption("threads"):
-    var orig = cast[ptr RefHeader](allocShared(s))
   else:
-    var orig = cast[ptr RefHeader](alloc(s))
-  orig.rc = 0
+    result = cast[ptr RefHeader](alignedAlloc0(s, alignment) +! hdrSize)
+  head(result).rc = 0
   when defined(gcOrc):
-    orig.rootIdx = 0
-  result = orig +! sizeof(RefHeader)
+    head(result).rootIdx = 0
   when defined(nimArcDebug):
     head(result).refId = gRefId
     atomicInc gRefId
@@ -147,7 +144,7 @@ when not defined(nimscript) and defined(nimArcDebug):
   else:
     result = 0

-proc nimRawDispose(p: pointer) {.compilerRtl.} =
+proc nimRawDispose(p: pointer, alignment: int) {.compilerRtl.} =
   when not defined(nimscript):
     when traceCollector:
       cprintf("[Freed] %p\n", p -! sizeof(RefHeader))
@@ -155,27 +152,21 @@ proc nimRawDispose(p: pointer) {.compilerRtl.} =
     if head(p).rc >= rcIncrement:
       cstderr.rawWrite "[FATAL] dangling references exist\n"
       quit 1
-
-  when defined(gcOrc) and defined(nimArcDebug):
-    if (head(p).rc and 0b100) != 0:
-      cstderr.rawWrite "[FATAL] cycle root freed\n"
-      quit 1
-
   when defined(nimArcDebug):
     # we do NOT really free the memory here in order to reliably detect use-after-frees
     if freedCells.data == nil: init(freedCells)
     freedCells.incl head(p)
-  elif compileOption("threads"):
-    deallocShared(p -! sizeof(RefHeader))
   else:
-    dealloc(p -! sizeof(RefHeader))
+    let hdrSize = align(sizeof(RefHeader), alignment)
+    alignedDealloc(p -! hdrSize, alignment)

-template dispose*[T](x: owned(ref T)) = nimRawDispose(cast[pointer](x))
+template dispose*[T](x: owned(ref T)) = nimRawDispose(cast[pointer](x), T.alignOf)
 #proc dispose*(x: pointer) = nimRawDispose(x)

 proc nimDestroyAndDispose(p: pointer) {.compilerRtl, raises: [].} =
-  let d = cast[ptr PNimTypeV2](p)[].destructor
-  if d != nil: cast[DestructorProc](d)(p)
+  let rti = cast[ptr PNimTypeV2](p)
+  if rti.destructor != nil:
+    cast[DestructorProc](rti.destructor)(p)
   when false:
     cstderr.rawWrite cast[ptr PNimTypeV2](p)[].name
     cstderr.rawWrite "\n"
@@ -183,7 +174,7 @@ proc nimDestroyAndDispose(p: pointer) {.compilerRtl, raises: [].} =
       cstderr.rawWrite "bah, nil\n"
     else:
       cstderr.rawWrite "has destructor!\n"
-  nimRawDispose(p)
+  nimRawDispose(p, rti.align)

 when defined(gcOrc):
   when defined(nimThinout):
@@ -216,7 +207,7 @@ proc GC_unref*[T](x: ref T) =
   if nimDecRefIsLast(cast[pointer](x)):
     # XXX this does NOT work for virtual destructors!
     `=destroy`(x[])
-    nimRawDispose(cast[pointer](x))
+    nimRawDispose(cast[pointer](x), T.alignOf)

 proc GC_ref*[T](x: ref T) =
   ## New runtime only supports this operation for 'ref T'.
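The invariant behind `nimNewObj` and `nimRawDispose` is that the `RefHeader` is padded up to the payload's alignment, so the user pointer sits a fixed, alignment-preserving distance `hdrSize` past the allocation base. A minimal stand-alone illustration (`alignUp` is a hypothetical stand-in for system's `align`; the 8-byte header size is an assumption for the example):

    proc alignUp(v, a: int): int = (v + a - 1) and not (a - 1)

    doAssert alignUp(8, 16) == 16  # an 8-byte RefHeader padded for a 16-byte payload
    doAssert alignUp(8, 32) == 32  # ... and for a 32-byte payload such as __m256d
    # nimNewObj returns base +! hdrSize from an `alignment`-aligned block; since
    # hdrSize is a multiple of `alignment`, the payload is aligned too, and
    # nimRawDispose can recover the allocation base as p -! hdrSize.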
diff --git a/lib/system/bitmasks.nim b/lib/system/bitmasks.nim
index 922ad5fb7..d7c55a4d9 100644
--- a/lib/system/bitmasks.nim
+++ b/lib/system/bitmasks.nim
@@ -15,7 +15,11 @@ const
   PageSize = 1 shl PageShift
   PageMask = PageSize-1

-  MemAlign = 16 # also minimal allocatable memory block
+  MemAlign = # also minimal allocatable memory block
+    when defined(useMalloc):
+      when defined(amd64): 16
+      else: 8
+    else: 16

   BitsPerPage = PageSize div MemAlign
   UnitsPerPage = BitsPerPage div (sizeof(int)*8)
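The `MemAlign` change records what the underlying allocator can actually promise: with `-d:useMalloc`, C's `malloc` must return memory suitably aligned for `max_align_t`, which is 16 bytes on typical 64-bit targets but only 8 on common 32-bit ones, while Nim's native allocator keeps its historical 16. A quick sanity check of that floor (an assumption-laden sketch; `MyMemAlign` is a local stand-in, and `alloc` here is whatever allocator the program was built with):

    const MyMemAlign = when defined(amd64): 16 else: 8  # mirrors the useMalloc branch
    let p = alloc(1)
    doAssert (cast[uint](p) and uint(MyMemAlign - 1)) == 0  # even tiny blocks keep MemAlign
    dealloc(p)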
diff --git a/lib/system/deepcopy.nim b/lib/system/deepcopy.nim
index b9dc594fa..1f30b8427 100644
--- a/lib/system/deepcopy.nim
+++ b/lib/system/deepcopy.nim
@@ -163,7 +163,7 @@ proc genericDeepCopyAux(dest, src: pointer, mt: PNimType; tab: var PtrTable) =
       when defined(nimSeqsV2):
         let typ = if mt.base.kind == tyObject: cast[PNimType](cast[ptr PNimTypeV2](s2)[].typeInfoV1)
                   else: mt.base
-        let z = nimNewObj(typ.size)
+        let z = nimNewObj(typ.size, typ.align)
         cast[PPointer](dest)[] = z
       else:
         # this version should work for any other GC:
diff --git a/lib/system/memalloc.nim b/lib/system/memalloc.nim
index 142762fe7..5f4304502 100644
--- a/lib/system/memalloc.nim
+++ b/lib/system/memalloc.nim
@@ -304,6 +304,87 @@ when hasAlloc and not defined(js):
     ## or other memory may be corrupted.
     deallocShared(p)

+  include bitmasks
+
+  template `+!`(p: pointer, s: SomeInteger): pointer =
+    cast[pointer](cast[int](p) +% int(s))
+
+  template `-!`(p: pointer, s: SomeInteger): pointer =
+    cast[pointer](cast[int](p) -% int(s))
+
+  proc alignedAlloc(size, align: Natural): pointer =
+    if align <= MemAlign:
+      when compileOption("threads"):
+        result = allocShared(size)
+      else:
+        result = alloc(size)
+    else:
+      # allocate (size + align - 1) necessary for alignment,
+      # plus 2 bytes to store offset
+      when compileOption("threads"):
+        let base = allocShared(size + align - 1 + sizeof(uint16))
+      else:
+        let base = alloc(size + align - 1 + sizeof(uint16))
+      # memory layout: padding + offset (2 bytes) + user_data
+      # in order to deallocate: read offset at user_data - 2 bytes,
+      # then deallocate user_data - offset
+      let offset = align - (cast[int](base) and (align - 1))
+      cast[ptr uint16](base +! (offset - sizeof(uint16)))[] = uint16(offset)
+      result = base +! offset
+
+  proc alignedAlloc0(size, align: Natural): pointer =
+    if align <= MemAlign:
+      when compileOption("threads"):
+        result = allocShared0(size)
+      else:
+        result = alloc0(size)
+    else:
+      # see comments for alignedAlloc
+      when compileOption("threads"):
+        let base = allocShared0(size + align - 1 + sizeof(uint16))
+      else:
+        let base = alloc0(size + align - 1 + sizeof(uint16))
+      let offset = align - (cast[int](base) and (align - 1))
+      cast[ptr uint16](base +! (offset - sizeof(uint16)))[] = uint16(offset)
+      result = base +! offset
+
+  proc alignedDealloc(p: pointer, align: int) {.compilerproc.} =
+    if align <= MemAlign:
+      when compileOption("threads"):
+        deallocShared(p)
+      else:
+        dealloc(p)
+    else:
+      # read offset at p - 2 bytes, then deallocate (p - offset) pointer
+      let offset = cast[ptr uint16](p -! sizeof(uint16))[]
+      when compileOption("threads"):
+        deallocShared(p -! offset)
+      else:
+        dealloc(p -! offset)
+
+  proc alignedRealloc(p: pointer, oldSize, newSize, align: Natural): pointer =
+    if align <= MemAlign:
+      when compileOption("threads"):
+        result = reallocShared(p, newSize)
+      else:
+        result = realloc(p, newSize)
+    else:
+      result = alignedAlloc(newSize, align)
+      copyMem(result, p, oldSize)
+      alignedDealloc(p, align)
+
+  proc alignedRealloc0(p: pointer, oldSize, newSize, align: Natural): pointer =
+    if align <= MemAlign:
+      when compileOption("threads"):
+        result = reallocShared0(p, oldSize, newSize)
+      else:
+        result = realloc0(p, oldSize, newSize)
+    else:
+      result = alignedAlloc(newSize, align)
+      copyMem(result, p, oldSize)
+      zeroMem(result +! oldSize, newSize - oldSize)
+      alignedDealloc(p, align)
+
 {.pop.}

 # GC interface:
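`alignedAlloc` makes each over-aligned block self-describing: it over-allocates by `align - 1` plus two bytes, rounds the base up to the requested boundary, and stores the distance back to the raw block in the `uint16` just below the returned pointer, which is all `alignedDealloc` needs. A compressed single-threaded sketch of the same trick (`myAlignedAlloc`/`myAlignedDealloc` are hypothetical names; it relies on `alloc` returning MemAlign-aligned memory, so for a power-of-two `align > MemAlign` the offset is always at least MemAlign and the two-byte slot fits):

    template `+!`(p: pointer, s: int): pointer = cast[pointer](cast[int](p) + s)
    template `-!`(p: pointer, s: int): pointer = cast[pointer](cast[int](p) - s)

    proc myAlignedAlloc(size, align: int): pointer =
      let base = alloc(size + align - 1 + sizeof(uint16))     # worst-case padding + offset slot
      let offset = align - (cast[int](base) and (align - 1))  # in 1..align; a multiple of MemAlign here
      cast[ptr uint16](base +! (offset - sizeof(uint16)))[] = uint16(offset)
      result = base +! offset

    proc myAlignedDealloc(p: pointer) =
      let offset = cast[ptr uint16](p -! sizeof(uint16))[]    # read back the stored distance
      dealloc(p -! int(offset))                               # free the original block

    let p = myAlignedAlloc(100, 64)
    doAssert (cast[uint](p) and 63) == 0                      # 64-byte aligned as requested
    myAlignedDealloc(p)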
diff --git a/lib/system/orc.nim b/lib/system/orc.nim
index 3c2327fd5..28f8e5808 100644
--- a/lib/system/orc.nim
+++ b/lib/system/orc.nim
@@ -88,7 +88,7 @@ proc free(s: Cell; desc: PNimTypeV2) {.inline.} =
     else:
       cstderr.rawWrite "has dispose!\n"

-  nimRawDispose(p)
+  nimRawDispose(p, desc.align)

 proc nimTraceRef(q: pointer; desc: PNimTypeV2; env: pointer) {.compilerRtl, inline.} =
   let p = cast[ptr pointer](q)
diff --git a/lib/system/seqs_v2.nim b/lib/system/seqs_v2.nim
index d83a0009a..b7f24ecd5 100644
--- a/lib/system/seqs_v2.nim
+++ b/lib/system/seqs_v2.nim
@@ -35,10 +35,7 @@ proc newSeqPayload(cap, elemSize, elemAlign: int): pointer {.compilerRtl, raises
   # we have to use type erasure here as Nim does not support generic
   # compilerProcs. Oh well, this will all be inlined anyway.
   if cap > 0:
-    when compileOption("threads"):
-      var p = cast[ptr NimSeqPayloadBase](allocShared0(align(sizeof(NimSeqPayloadBase), elemAlign) + cap * elemSize))
-    else:
-      var p = cast[ptr NimSeqPayloadBase](alloc0(align(sizeof(NimSeqPayloadBase), elemAlign) + cap * elemSize))
+    var p = cast[ptr NimSeqPayloadBase](alignedAlloc0(align(sizeof(NimSeqPayloadBase), elemAlign) + cap * elemSize, elemAlign))
     p.cap = cap
     result = p
   else:
@@ -65,20 +62,14 @@ proc prepareSeqAdd(len: int; p: pointer; addlen, elemSize, elemAlign: int): poin
     let oldCap = p.cap and not strlitFlag
     let newCap = max(resize(oldCap), len+addlen)
     if (p.cap and strlitFlag) == strlitFlag:
-      when compileOption("threads"):
-        var q = cast[ptr NimSeqPayloadBase](allocShared0(headerSize + elemSize * newCap))
-      else:
-        var q = cast[ptr NimSeqPayloadBase](alloc0(headerSize + elemSize * newCap))
+      var q = cast[ptr NimSeqPayloadBase](alignedAlloc0(headerSize + elemSize * newCap, elemAlign))
       copyMem(q +! headerSize, p +! headerSize, len * elemSize)
       q.cap = newCap
       result = q
     else:
       let oldSize = headerSize + elemSize * oldCap
       let newSize = headerSize + elemSize * newCap
-      when compileOption("threads"):
-        var q = cast[ptr NimSeqPayloadBase](reallocShared0(p, oldSize, newSize))
-      else:
-        var q = cast[ptr NimSeqPayloadBase](realloc0(p, oldSize, newSize))
+      var q = cast[ptr NimSeqPayloadBase](alignedRealloc0(p, oldSize, newSize, elemAlign))
       q.cap = newCap
       result = q
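Note how `newSeqPayload` sizes the block: `align(sizeof(NimSeqPayloadBase), elemAlign) + cap * elemSize` pads the capacity header to the element alignment, so once the whole payload is `elemAlign`-aligned (which `alignedAlloc0` guarantees), element 0 is too. A tiny numeric illustration (assuming an 8-byte `NimSeqPayloadBase` for the sake of the example; `alignUp` mirrors system's `align`):

    proc alignUp(v, a: int): int = (v + a - 1) and not (a - 1)

    doAssert alignUp(8, 8)  == 8   # ordinary elements: no padding needed
    doAssert alignUp(8, 32) == 32  # __m256d elements start 32 bytes in, on a 32-byte boundary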
diff --git a/tests/arc/thard_alignment.nim b/tests/arc/thard_alignment.nim
new file mode 100644
index 000000000..e644572f0
--- /dev/null
+++ b/tests/arc/thard_alignment.nim
@@ -0,0 +1,146 @@
+discard """
+disabled: "arm64"
+cmd: "nim c --gc:arc $file"
+output: "y"
+"""
+
+{.passC: "-march=native".}
+
+proc isAlignedCheck(p: pointer, alignment: int) =
+  doAssert (cast[uint](p) and uint(alignment - 1)) == 0
+
+proc isAlignedCheck[T](p: ref T, alignment: int) =
+  isAlignedCheck(cast[pointer](p), alignment)
+
+type
+  m256d {.importc: "__m256d", header: "immintrin.h".} = object
+
+proc set1(x: float): m256d {.importc: "_mm256_set1_pd", header: "immintrin.h".}
+func `+`(a,b: m256d): m256d {.importc: "_mm256_add_pd", header: "immintrin.h".}
+proc `$`(a: m256d): string =
+  result = $(cast[ptr float](a.unsafeAddr)[])
+
+
+var res: seq[seq[m256d]]
+
+for _ in 1..1000:
+  var x = newSeq[m256d](1)
+  x[0] = set1(1.0) # test if operation causes segfault
+  isAlignedCheck(x[0].addr, alignof(m256d))
+  res.add x
+
+var res2: seq[m256d]
+for i in 1..10000:
+  res2.setLen(res2.len + 1) # check if realloc works
+  isAlignedCheck(res2[0].addr, alignof(m256d))
+
+proc lambdaGen(a, b: float, z: ref m256d) : auto =
+  var x1 = new(m256d)
+  var x2 = new(m256d)
+  isAlignedCheck(x1, alignof(m256d))
+  isAlignedCheck(x2, alignof(m256d))
+  x1[] = set1(2.0 + a)
+  x2[] = set1(-23.0 - b)
+  let capturingLambda = proc(x: ref m256d): ref m256d =
+    var cc = new(m256d)
+    var bb = new(m256d)
+    isAlignedCheck(x1, alignof(m256d))
+    isAlignedCheck(x2, alignof(m256d))
+    isAlignedCheck(cc, alignof(m256d))
+    isAlignedCheck(bb, alignof(m256d))
+    isAlignedCheck(z, alignof(m256d))
+
+    cc[] = x1[] + x1[] + z[]
+    bb[] = x2[] + set1(12.5) + z[]
+
+    result = new(m256d)
+    isAlignedCheck(result, alignof(m256d))
+    result[] = cc[] + bb[] + x[]
+
+  return capturingLambda
+
+var xx = new(m256d)
+xx[] = set1(10)
+isAlignedCheck(xx, alignOf(m256d))
+
+let f1 = lambdaGen(2.0 , 2.221, xx)
+let f2 = lambdaGen(-1.226 , 3.5, xx)
+isAlignedCheck(f1(xx), alignOf(m256d))
+isAlignedCheck(f2(xx), alignOf(m256d))
+
+
+#-----------------------------------------------------------------------------
+
+type
+  MyAligned = object of RootObj
+    a{.align: 128.}: float
+
+
+var f: MyAligned
+isAlignedCheck(f.addr, MyAligned.alignOf)
+
+var fref = new(MyAligned)
+isAlignedCheck(fref, MyAligned.alignOf)
+
+var fs: seq[MyAligned]
+var fr: seq[RootRef]
+
+for i in 0..1000:
+  fs.add MyAligned()
+  isAlignedCheck(fs[^1].addr, MyAligned.alignOf)
+  fs[^1].a = i.float
+
+  fr.add new(MyAligned)
+  isAlignedCheck(fr[^1], MyAligned.alignOf)
+  ((ref MyAligned)fr[^1])[].a = i.float
+
+for i in 0..1000:
+  doAssert(fs[i].a == i.float)
+  doAssert(((ref MyAligned)fr[i]).a == i.float)
+
+
+proc lambdaTest2(a: MyAligned, z: ref MyAligned): auto =
+  var x1: MyAligned
+  x1.a = a.a + z.a
+  var x2: MyAligned
+  x2.a = a.a - z.a
+  let capturingLambda = proc(x: MyAligned): MyAligned =
+    var cc: MyAligned
+    var bb: MyAligned
+    isAlignedCheck(x1.addr, MyAligned.alignOf)
+    isAlignedCheck(x2.addr, MyAligned.alignOf)
+    isAlignedCheck(cc.addr, MyAligned.alignOf)
+    isAlignedCheck(bb.addr, MyAligned.alignOf)
+    isAlignedCheck(z, MyAligned.alignOf)
+
+    cc.a = x1.a + x1.a + z.a
+    bb.a = x2.a - z.a
+
+    isAlignedCheck(result.addr, MyAligned.alignOf)
+    result.a = cc.a + bb.a + x2.a
+
+  return capturingLambda
+
+
+let q1 = lambdaTest2(MyAligned(a: 1.0), (ref MyAligned)(a: 2.0))
+let q2 = lambdaTest2(MyAligned( a: -1.0), (ref MyAligned)(a: -2.0))
+
+isAlignedCheck(rawEnv(q1), MyAligned.alignOf)
+isAlignedCheck(rawEnv(q2), MyAligned.alignOf)
+discard q1(MyAligned(a: 1.0))
+discard q2(MyAligned(a: -1.0))
+
+
+#-----------------------------------------------------------------------------
+
+block:
+  var s: seq[seq[MyAligned]]
+  for len in 0..128:
+    s.add newSeq[MyAligned](len)
+    for i in 0..<len:
+      s[^1][i] = MyAligned(a: 1.0)
+
+    if len > 0:
+      isAlignedCheck(s[^1][0].addr, MyAligned.alignOf)
+
+echo "y"
diff --git a/tests/collections/thardalignmentconstraint.nim b/tests/collections/thardalignmentconstraint.nim
deleted file mode 100644
index e3a3081b9..000000000
--- a/tests/collections/thardalignmentconstraint.nim
+++ /dev/null
@@ -1,17 +0,0 @@
-discard """
-disabled: true
-"""
-
-# does not yet work
-
-{.passC: "-march=native".}
-
-type
-  m256d {.importc: "__m256d", header: "immintrin.h".} = object
-
-proc set1(x: float): m256d {.importc: "_mm256_set1_pd", header: "immintrin.h".}
-
-for _ in 1..1000:
-  var x = newSeq[m256d](1)
-  x[0] = set1(1.0) # test if operation causes segfault
-  doAssert (cast[uint](x[0].addr) and 31) == 0