From 214c7a2ebdd15b623dff7e537007c99e66f256cc Mon Sep 17 00:00:00 2001 From: Araq Date: Sat, 15 Sep 2012 00:21:10 +0200 Subject: implemented a stack trace profiler --- compiler/ccgstmts.nim | 4 ++ compiler/cgen.nim | 19 ++----- compiler/cgendata.nim | 1 - compiler/nimrod.nim | 4 ++ compiler/ropes.nim | 1 + lib/pure/nimprof.nim | 125 ++++++++++++++++++++++++++++++++++++++++++ lib/system.nim | 3 +- lib/system/alloc.nim | 2 + lib/system/gc.nim | 10 +++- lib/system/profiler.nim | 140 ++++++++++++++++++++++++++++++++---------------- todo.txt | 3 +- 11 files changed, 244 insertions(+), 68 deletions(-) create mode 100644 lib/pure/nimprof.nim diff --git a/compiler/ccgstmts.nim b/compiler/ccgstmts.nim index dfa624821..9c25a44ec 100755 --- a/compiler/ccgstmts.nim +++ b/compiler/ccgstmts.nim @@ -265,6 +265,10 @@ proc genWhileStmt(p: BProc, t: PNode) = let label = assignLabel(p.blocks[p.breakIdx]) lineF(p, cpsStmts, "if (!$1) goto $2;$n", [rdLoc(a), label]) genStmts(p, t.sons[1]) + + if optProfiler in p.options: + # invoke at loop body exit: + lineCg(p, cpsStmts, "#nimProfile();$n") endBlock(p) dec(p.withinLoop) diff --git a/compiler/cgen.nim b/compiler/cgen.nim index 840b80ade..07ab15b33 100755 --- a/compiler/cgen.nim +++ b/compiler/cgen.nim @@ -680,27 +680,14 @@ proc genProcAux(m: BModule, prc: PSym) = app(generatedProc, initFrame(p, procname, filename)) else: app(generatedProc, p.s(cpsLocals)) - if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM): - if gProcProfile >= 64 * 1024: - InternalError(prc.info, "too many procedures for profiling") - discard cgsym(m, "profileData") - appf(p.s(cpsLocals), "\tticks NIM_profilingStart;$n") - if prc.loc.a < 0: - appf(m.s[cfsDebugInit], "\tprofileData[$1].procname = $2;$n", [ - toRope(gProcProfile), - makeCString(prc.name.s)]) - prc.loc.a = gProcProfile - inc(gProcProfile) - prepend(p.s(cpsInit), ropef("\tNIM_profilingStart = getticks();$n")) + if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM): + # invoke at proc entry for recursion: + appcg(p, cpsInit, "\t#nimProfile();$n", []) app(generatedProc, p.s(cpsInit)) app(generatedProc, p.s(cpsStmts)) if p.beforeRetNeeded: appf(generatedProc, "\tBeforeRet: ;$n") app(generatedProc, deinitGCFrame(p)) if optStackTrace in prc.options: app(generatedProc, deinitFrame(p)) - if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM): - appf(generatedProc, - "\tprofileData[$1].total += elapsed(getticks(), NIM_profilingStart);$n", - [toRope(prc.loc.a)]) app(generatedProc, returnStmt) appf(generatedProc, "}$N") app(m.s[cfsProcs], generatedProc) diff --git a/compiler/cgendata.nim b/compiler/cgendata.nim index f42381e59..57911c004 100644 --- a/compiler/cgendata.nim +++ b/compiler/cgendata.nim @@ -115,7 +115,6 @@ type var mainModProcs*, mainModInit*, mainDatInit*: PRope # parts of the main module gMapping*: PRope # the generated mapping file (if requested) - gProcProfile*: Natural # proc profile counter gPendingModules*: seq[BModule] = @[] # list of modules that are not # finished with code generation gModules*: seq[BModule] = @[] # list of all compiled modules diff --git a/compiler/nimrod.nim b/compiler/nimrod.nim index 1bd6698a7..0305396da 100755 --- a/compiler/nimrod.nim +++ b/compiler/nimrod.nim @@ -20,6 +20,10 @@ import when hasTinyCBackend: import tccgen +when defined(profiler): + {.hint: "Profiling support is turned on!".} + import nimprof + var arguments: string = "" # the arguments to be passed to the program that # should be run diff --git a/compiler/ropes.nim b/compiler/ropes.nim index 0107f5241..50c89e4d9 100755 --- a/compiler/ropes.nim +++ b/compiler/ropes.nim @@ -294,6 +294,7 @@ proc crcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 = result = crcFromRopeAux(r.right, result) proc newCrcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 = + # XXX profiling shows this is actually expensive var stack: TRopeSeq = @[r] result = startVal while len(stack) > 0: diff --git a/lib/pure/nimprof.nim b/lib/pure/nimprof.nim new file mode 100644 index 000000000..99af71f90 --- /dev/null +++ b/lib/pure/nimprof.nim @@ -0,0 +1,125 @@ +# +# +# Nimrod's Runtime Library +# (c) Copyright 2012 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## Profiling support for Nimrod. This is an embedded profiler that requires +## ``--profiler:on``. You only need to import this module to get a profiling +## report at program exit. + +when not defined(profiler): + {.error: "Profiling support is turned off!".} + +# We don't want to profile the profiling code ... +{.push profiler: off.} + +import hashes, algorithm, strutils + +# We use a simple hash table of bounded size to keep track of the stack traces: +type + TProfileEntry = object + total: int + st: system.TStackTrace + TProfileData = array [0..64*1024-1, ref TProfileEntry] + +proc `==`(a, b: TStackTrace): bool = + for i in 0 .. high(a): + if a[i] != b[i]: return false + result = true + +# XXX extract this data structure; it is generally useful ;-) +var + profileData: TProfileData + emptySlots = profileData.len * 3 div 2 + maxChainLen = 0 + totalCalls = 0 + +proc hook(st: system.TStackTrace) {.nimcall.} = + # this is quite performance sensitive! + inc totalCalls + var last = high(st) + while last > 0 and isNil(st[last]): dec last + var h = hash(pointer(st[last])) and high(profileData) + + # we use probing for maxChainLen entries and replace the encountered entry + # with the minimal 'total' value: + if emptySlots == 0: + var minIdx = h + var probes = maxChainLen + while probes >= 0: + if profileData[h].st == st: + # wow, same entry found: + inc profileData[h].total + return + if profileData[minIdx].total < profileData[h].total: + minIdx = h + h = ((5 * h) + 1) and high(profileData) + dec probes + profileData[minIdx].total = 1 + profileData[minIdx].st = st + else: + var chain = 0 + while true: + if profileData[h] == nil: + GC_disable() + new profileData[h] + GC_enable() + profileData[h].total = 1 + profileData[h].st = st + dec emptySlots + maxChainLen = max(maxChainLen, chain) + return + if profileData[h].st == st: + # wow, same entry found: + inc profileData[h].total + maxChainLen = max(maxChainLen, chain) + return + h = ((5 * h) + 1) and high(profileData) + inc chain + +proc getTotal(x: ref TProfileEntry): int = + result = if isNil(x): 0 else: x.total + +proc cmpEntries(a, b: ref TProfileEntry): int = + result = b.getTotal - a.getTotal + +proc writeProfile() {.noconv.} = + stopProfiling() + const filename = "profile_results" + var f: TFile + var j = 1 + while open(f, filename & $j & ".txt"): + close(f) + inc(j) + let filen = filename & $j & ".txt" + echo "writing ", filen, "..." + if open(f, filen, fmWrite): + sort(profileData, cmpEntries) + writeln(f, "total executions of each stack trace:") + var entries = 0 + for i in 0..high(profileData): + if profileData[i] != nil: inc entries + + var sum = 0 + # only write the first 100 entries: + for i in 0..min(100, high(profileData)): + if profileData[i] != nil and profileData[i].total > 1: + inc sum, profileData[i].total + writeln(f, "Entry: ", i, "/", entries, " Calls: ", + profileData[i].total, "/", totalCalls, " [sum: ", sum, "; ", + formatFloat(sum / totalCalls * 100.0, ffDefault, 2), "%]") + for ii in 0..high(TStackTrace): + let procname = profileData[i].st[ii] + if isNil(procname): break + writeln(f, " ", procname) + close(f) + echo "... done" + +system.profilerHook = hook +addQuitProc(writeProfile) + +{.pop.} diff --git a/lib/system.nim b/lib/system.nim index 911b134e2..7b129bf46 100755 --- a/lib/system.nim +++ b/lib/system.nim @@ -829,7 +829,8 @@ when taintMode: else: type TaintedString* = string - +when defined(profiler): + proc nimProfile() {.compilerProc, noinline.} when hasThreadSupport: {.pragma: rtlThreadVar, threadvar.} else: diff --git a/lib/system/alloc.nim b/lib/system/alloc.nim index 5182d5ccf..82755ca86 100755 --- a/lib/system/alloc.nim +++ b/lib/system/alloc.nim @@ -11,6 +11,7 @@ # TODO: # - eliminate "used" field # - make searching for block O(1) +{.push profiler:off.} # ------------ platform specific chunk allocation code ----------------------- @@ -794,3 +795,4 @@ template InstantiateForRegion(allocator: expr) = else: result = realloc(p, newsize) +{.pop.} diff --git a/lib/system/gc.nim b/lib/system/gc.nim index 470064c5f..a5937efe5 100755 --- a/lib/system/gc.nim +++ b/lib/system/gc.nim @@ -7,7 +7,6 @@ # distribution, for details about the copyright. # - # Garbage Collector # # The basic algorithm is *Deferrent Reference Counting* with cycle detection. @@ -17,6 +16,7 @@ # Special care has been taken to avoid recursion as far as possible to avoid # stack overflows when traversing deep datastructures. It is well-suited # for soft real time applications (like games). +{.push profiler:off.} const CycleIncrease = 2 # is a multiplicative increase @@ -26,7 +26,7 @@ const # this seems to be a good value withRealTime = defined(useRealtimeGC) -when withRealTime: +when withRealTime and not defined(getTicks): include "system/timers" const @@ -426,6 +426,8 @@ proc rawNewObj(typ: PNimType, size: int, gch: var TGcHeap): pointer = result = cellToUsr(res) sysAssert(allocInv(gch.region), "rawNewObj end") +{.pop.} + proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} = result = rawNewObj(typ, size, gch) zeroMem(result, size) @@ -514,6 +516,8 @@ proc growObj(old: pointer, newsize: int, gch: var TGcHeap): pointer = proc growObj(old: pointer, newsize: int): pointer {.rtl.} = result = growObj(old, newsize, gch) +{.push profiler:off.} + # ---------------- cycle collector ------------------------------------------- proc doOperation(p: pointer, op: TWalkOp) = @@ -917,3 +921,5 @@ when not defined(useNimRtl): "[GC] max pause time [ms]: " & $(gch.stat.maxPause div 1000_000) when traceGC: writeLeakage() GC_enable() + +{.pop.} diff --git a/lib/system/profiler.nim b/lib/system/profiler.nim index 295cb43c7..aef75d39d 100755 --- a/lib/system/profiler.nim +++ b/lib/system/profiler.nim @@ -8,54 +8,100 @@ # # This file implements the Nimrod profiler. The profiler needs support by the -# code generator. +# code generator. The idea is to inject the instruction stream +# with 'nimProfile()' calls. These calls are injected at every loop end +# (except perhaps loops that have no side-effects). At every Nth call a +# stack trace is taken. A stack tace is a list of cstrings. We have a count +# table of those. +# +# The nice thing about this profiler is that it's completely time independent! + +{.push profiler: off.} + +when not defined(getTicks): + include "system/timers" + +const + MaxTraceLen = 20 # tracking the last 20 calls is enough type - TProfileData {.compilerproc, final.} = object - procname: cstring - total: float + TStackTrace* = array [0..MaxTraceLen-1, cstring] + TProfilerHook* = proc (st: TStackTrace) {.nimcall.} + +proc captureStackTrace(f: PFrame, st: var TStackTrace) = + const + firstCalls = 5 + var + it = f + i = 0 + total = 0 + while it != nil and i <= high(st)-(firstCalls-1): + # the (-1) is for a nil entry that marks where the '...' should occur + st[i] = it.procname + inc(i) + inc(total) + it = it.prev + var b = it + while it != nil: + inc(total) + it = it.prev + for j in 1..total-i-(firstCalls-1): + if b != nil: b = b.prev + if total != i: + st[i] = "..." + inc(i) + while b != nil and i <= high(st): + st[i] = b.procname + inc(i) + b = b.prev var - profileData {.compilerproc.}: array [0..64*1024-1, TProfileData] - -proc sortProfile(a: var array[0..64*1024-1, TProfileData], N: int) = - # we use shellsort here; fast enough and simple - var h = 1 - while true: - h = 3 * h + 1 - if h > N: break - while true: - h = h div 3 - for i in countup(h, N - 1): - var v = a[i] - var j = i - while a[j-h].total <= v.total: - a[j] = a[j-h] - j = j-h - if j < h: break - a[j] = v - if h == 1: break - -proc writeProfile() {.noconv.} = - const filename = "profile_results" - var i = 0 - var f: TFile - var j = 1 - while open(f, filename & $j & ".txt"): - close(f) - inc(j) - if open(f, filename & $j & ".txt", fmWrite): - var N = 0 - # we have to compute the actual length of the array: - while profileData[N].procname != nil: inc(N) - sortProfile(profileData, N) - writeln(f, "total running time of each proc" & - " (interpret these numbers relatively)") - while profileData[i].procname != nil: - write(f, profileData[i].procname) - write(f, ": ") - writeln(f, profileData[i].total) - inc(i) - close(f) - -addQuitProc(writeProfile) + profilerHook*: TProfilerHook + ## set this variable to provide a procedure that implements a profiler in + ## user space. See the `nimprof` module for a reference implementation. + SamplingInterval = 50_000 + # set this to change the default sampling interval + gTicker = SamplingInterval + interval: TNanos = 5_000_000 # 5ms + +proc callProfilerHook(hook: TProfilerHook) {.noinline.} = + # 'noinline' so that 'nimProfile' does not perform the stack allocation + # in the common case. + var st: TStackTrace + captureStackTrace(framePtr, st) + hook(st) + +proc setProfilingInterval*(intervalInUs: int): TNanos = + ## set this to change the sampling interval. Default value is 5ms. + interval = intervalInUs * 1000 + +var t0: TTicks + +var usefulCall* = 0 +var uselessCall* = 0 + +proc nimProfile() = + ## This is invoked by the compiler in every loop and on every proc entry! + dec gTicker + if gTicker == 0: + gTicker = -1 + let t1 = getticks() + if getticks() - t0 > interval: + inc usefulCall + if not isNil(profilerHook): + # disable recursive calls: XXX should use try..finally, + # but that's too expensive! + let oldHook = profilerHook + profilerHook = nil + callProfilerHook(oldHook) + profilerHook = oldHook + t0 = getticks() + else: + inc uselessCall + gTicker = SamplingInterval + +proc stopProfiling*() = + ## call this to stop profiling; should be called by the client profiler. + profilerHook = nil + +{.pop.} diff --git a/todo.txt b/todo.txt index 7ed287284..91cd02429 100755 --- a/todo.txt +++ b/todo.txt @@ -1,7 +1,6 @@ version 0.9.0 ============= -- make 'm: stmt' use overloading resolution - make 'bind' default for templates and introduce 'mixin' - implicit deref for parameter matching @@ -22,6 +21,7 @@ Bugs - bug: the parser is not strict enough with newlines: 'echo "a" echo "b"' compiles - bug: blocks can "export" an identifier but the CCG generates {} for them ... +- bug: what if we pass an iterator to a proc var? iterators use tyProc too ... version 0.9.XX @@ -38,6 +38,7 @@ version 0.9.XX echo a echo b) +- make 'm: stmt' use overloading resolution - implement "closure tuple consists of a single 'ref'" optimization - implement for loop transformation for first class iterators - JS gen: -- cgit 1.4.1-2-gfad0