summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorAraq <rumpf_a@web.de>2012-09-15 00:21:10 +0200
committerAraq <rumpf_a@web.de>2012-09-15 00:21:10 +0200
commit214c7a2ebdd15b623dff7e537007c99e66f256cc (patch)
treea37304594d96e841d96be39bf53fd89969a5ac66
parent4a435a8fb4551b03bdcbbbd9b074a51fa46928dd (diff)
downloadNim-214c7a2ebdd15b623dff7e537007c99e66f256cc.tar.gz
implemented a stack trace profiler
-rwxr-xr-xcompiler/ccgstmts.nim4
-rwxr-xr-xcompiler/cgen.nim19
-rw-r--r--compiler/cgendata.nim1
-rwxr-xr-xcompiler/nimrod.nim4
-rwxr-xr-xcompiler/ropes.nim1
-rw-r--r--lib/pure/nimprof.nim125
-rwxr-xr-xlib/system.nim3
-rwxr-xr-xlib/system/alloc.nim2
-rwxr-xr-xlib/system/gc.nim10
-rwxr-xr-xlib/system/profiler.nim140
-rwxr-xr-xtodo.txt3
11 files changed, 244 insertions, 68 deletions
diff --git a/compiler/ccgstmts.nim b/compiler/ccgstmts.nim
index dfa624821..9c25a44ec 100755
--- a/compiler/ccgstmts.nim
+++ b/compiler/ccgstmts.nim
@@ -265,6 +265,10 @@ proc genWhileStmt(p: BProc, t: PNode) =
       let label = assignLabel(p.blocks[p.breakIdx])
       lineF(p, cpsStmts, "if (!$1) goto $2;$n", [rdLoc(a), label])
     genStmts(p, t.sons[1])
+
+    if optProfiler in p.options:
+      # invoke at loop body exit:
+      lineCg(p, cpsStmts, "#nimProfile();$n")
     endBlock(p)
 
   dec(p.withinLoop)
diff --git a/compiler/cgen.nim b/compiler/cgen.nim
index 840b80ade..07ab15b33 100755
--- a/compiler/cgen.nim
+++ b/compiler/cgen.nim
@@ -680,27 +680,14 @@ proc genProcAux(m: BModule, prc: PSym) =
       app(generatedProc, initFrame(p, procname, filename))
     else: 
       app(generatedProc, p.s(cpsLocals))
-    if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM): 
-      if gProcProfile >= 64 * 1024: 
-        InternalError(prc.info, "too many procedures for profiling")
-      discard cgsym(m, "profileData")
-      appf(p.s(cpsLocals), "\tticks NIM_profilingStart;$n")
-      if prc.loc.a < 0: 
-        appf(m.s[cfsDebugInit], "\tprofileData[$1].procname = $2;$n", [
-            toRope(gProcProfile), 
-            makeCString(prc.name.s)])
-        prc.loc.a = gProcProfile
-        inc(gProcProfile)
-      prepend(p.s(cpsInit), ropef("\tNIM_profilingStart = getticks();$n"))
+    if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM):
+      # invoke at proc entry for recursion:
+      appcg(p, cpsInit, "\t#nimProfile();$n", [])
     app(generatedProc, p.s(cpsInit))
     app(generatedProc, p.s(cpsStmts))
     if p.beforeRetNeeded: appf(generatedProc, "\tBeforeRet: ;$n")
     app(generatedProc, deinitGCFrame(p))
     if optStackTrace in prc.options: app(generatedProc, deinitFrame(p))
-    if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM): 
-      appf(generatedProc, 
-        "\tprofileData[$1].total += elapsed(getticks(), NIM_profilingStart);$n", 
-        [toRope(prc.loc.a)])
     app(generatedProc, returnStmt)
     appf(generatedProc, "}$N")
   app(m.s[cfsProcs], generatedProc)
diff --git a/compiler/cgendata.nim b/compiler/cgendata.nim
index f42381e59..57911c004 100644
--- a/compiler/cgendata.nim
+++ b/compiler/cgendata.nim
@@ -115,7 +115,6 @@ type
 var
   mainModProcs*, mainModInit*, mainDatInit*: PRope # parts of the main module
   gMapping*: PRope             # the generated mapping file (if requested)
-  gProcProfile*: Natural       # proc profile counter
   gPendingModules*: seq[BModule] = @[] # list of modules that are not
                                        # finished with code generation
   gModules*: seq[BModule] = @[] # list of all compiled modules
diff --git a/compiler/nimrod.nim b/compiler/nimrod.nim
index 1bd6698a7..0305396da 100755
--- a/compiler/nimrod.nim
+++ b/compiler/nimrod.nim
@@ -20,6 +20,10 @@ import
 when hasTinyCBackend:
   import tccgen
 
+when defined(profiler):
+  {.hint: "Profiling support is turned on!".}
+  import nimprof
+
 var 
   arguments: string = ""      # the arguments to be passed to the program that
                               # should be run
diff --git a/compiler/ropes.nim b/compiler/ropes.nim
index 0107f5241..50c89e4d9 100755
--- a/compiler/ropes.nim
+++ b/compiler/ropes.nim
@@ -294,6 +294,7 @@ proc crcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 =
     result = crcFromRopeAux(r.right, result)
 
 proc newCrcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 = 
+  # XXX profiling shows this is actually expensive
   var stack: TRopeSeq = @[r]
   result = startVal
   while len(stack) > 0: 
diff --git a/lib/pure/nimprof.nim b/lib/pure/nimprof.nim
new file mode 100644
index 000000000..99af71f90
--- /dev/null
+++ b/lib/pure/nimprof.nim
@@ -0,0 +1,125 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2012 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Profiling support for Nimrod. This is an embedded profiler that requires
+## ``--profiler:on``. You only need to import this module to get a profiling
+## report at program exit.
+
+when not defined(profiler):
+  {.error: "Profiling support is turned off!".}
+
+# We don't want to profile the profiling code ...
+{.push profiler: off.}
+
+import hashes, algorithm, strutils
+
+# We use a simple hash table of bounded size to keep track of the stack traces:
+type
+  TProfileEntry = object
+    total: int
+    st: system.TStackTrace
+  TProfileData = array [0..64*1024-1, ref TProfileEntry]
+
+proc `==`(a, b: TStackTrace): bool =
+  for i in 0 .. high(a):
+    if a[i] != b[i]: return false
+  result = true
+
+# XXX extract this data structure; it is generally useful ;-)
+var
+  profileData: TProfileData
+  emptySlots = profileData.len * 3 div 2
+  maxChainLen = 0
+  totalCalls = 0
+
+proc hook(st: system.TStackTrace) {.nimcall.} =
+  # this is quite performance sensitive!
+  inc totalCalls
+  var last = high(st)
+  while last > 0 and isNil(st[last]): dec last
+  var h = hash(pointer(st[last])) and high(profileData)
+  
+  # we use probing for maxChainLen entries and replace the encountered entry
+  # with the minimal 'total' value:
+  if emptySlots == 0:
+    var minIdx = h
+    var probes = maxChainLen
+    while probes >= 0:
+      if profileData[h].st == st:
+        # wow, same entry found:
+        inc profileData[h].total
+        return
+      if profileData[minIdx].total < profileData[h].total:
+        minIdx = h
+      h = ((5 * h) + 1) and high(profileData)
+      dec probes
+    profileData[minIdx].total = 1
+    profileData[minIdx].st = st
+  else:
+    var chain = 0
+    while true:
+      if profileData[h] == nil:
+        GC_disable()
+        new profileData[h]
+        GC_enable()
+        profileData[h].total = 1
+        profileData[h].st = st
+        dec emptySlots
+        maxChainLen = max(maxChainLen, chain)
+        return
+      if profileData[h].st == st:
+        # wow, same entry found:
+        inc profileData[h].total
+        maxChainLen = max(maxChainLen, chain)
+        return
+      h = ((5 * h) + 1) and high(profileData)
+      inc chain
+
+proc getTotal(x: ref TProfileEntry): int =
+  result = if isNil(x): 0 else: x.total
+
+proc cmpEntries(a, b: ref TProfileEntry): int =
+  result = b.getTotal - a.getTotal
+
+proc writeProfile() {.noconv.} =
+  stopProfiling()
+  const filename = "profile_results"
+  var f: TFile
+  var j = 1
+  while open(f, filename & $j & ".txt"):
+    close(f)
+    inc(j)
+  let filen = filename & $j & ".txt"
+  echo "writing ", filen, "..."
+  if open(f, filen, fmWrite):
+    sort(profileData, cmpEntries)
+    writeln(f, "total executions of each stack trace:")
+    var entries = 0
+    for i in 0..high(profileData):
+      if profileData[i] != nil: inc entries
+    
+    var sum = 0
+    # only write the first 100 entries:
+    for i in 0..min(100, high(profileData)):
+      if profileData[i] != nil and profileData[i].total > 1:
+        inc sum, profileData[i].total
+        writeln(f, "Entry: ", i, "/", entries, " Calls: ",
+          profileData[i].total, "/", totalCalls, " [sum: ", sum, "; ",
+          formatFloat(sum / totalCalls * 100.0, ffDefault, 2), "%]")
+        for ii in 0..high(TStackTrace):
+          let procname = profileData[i].st[ii]
+          if isNil(procname): break
+          writeln(f, "  ", procname)
+    close(f)
+  echo "... done"
+
+system.profilerHook = hook
+addQuitProc(writeProfile)
+
+{.pop.}
diff --git a/lib/system.nim b/lib/system.nim
index 911b134e2..7b129bf46 100755
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -829,7 +829,8 @@ when taintMode:
 else:
   type TaintedString* = string
 
-
+when defined(profiler):
+  proc nimProfile() {.compilerProc, noinline.}
 when hasThreadSupport:
   {.pragma: rtlThreadVar, threadvar.}
 else:
diff --git a/lib/system/alloc.nim b/lib/system/alloc.nim
index 5182d5ccf..82755ca86 100755
--- a/lib/system/alloc.nim
+++ b/lib/system/alloc.nim
@@ -11,6 +11,7 @@
 # TODO: 
 # - eliminate "used" field
 # - make searching for block O(1)
+{.push profiler:off.}
 
 # ------------ platform specific chunk allocation code -----------------------
 
@@ -794,3 +795,4 @@ template InstantiateForRegion(allocator: expr) =
     else:
       result = realloc(p, newsize)
 
+{.pop.}
diff --git a/lib/system/gc.nim b/lib/system/gc.nim
index 470064c5f..a5937efe5 100755
--- a/lib/system/gc.nim
+++ b/lib/system/gc.nim
@@ -7,7 +7,6 @@
 #    distribution, for details about the copyright.
 #
 
-
 #            Garbage Collector
 #
 # The basic algorithm is *Deferrent Reference Counting* with cycle detection.
@@ -17,6 +16,7 @@
 # Special care has been taken to avoid recursion as far as possible to avoid
 # stack overflows when traversing deep datastructures. It is well-suited
 # for soft real time applications (like games).
+{.push profiler:off.}
 
 const
   CycleIncrease = 2 # is a multiplicative increase
@@ -26,7 +26,7 @@ const
                       # this seems to be a good value
   withRealTime = defined(useRealtimeGC)
 
-when withRealTime:
+when withRealTime and not defined(getTicks):
   include "system/timers"
 
 const
@@ -426,6 +426,8 @@ proc rawNewObj(typ: PNimType, size: int, gch: var TGcHeap): pointer =
   result = cellToUsr(res)
   sysAssert(allocInv(gch.region), "rawNewObj end")
 
+{.pop.}
+
 proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
   result = rawNewObj(typ, size, gch)
   zeroMem(result, size)
@@ -514,6 +516,8 @@ proc growObj(old: pointer, newsize: int, gch: var TGcHeap): pointer =
 proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
   result = growObj(old, newsize, gch)
 
+{.push profiler:off.}
+
 # ---------------- cycle collector -------------------------------------------
 
 proc doOperation(p: pointer, op: TWalkOp) =
@@ -917,3 +921,5 @@ when not defined(useNimRtl):
              "[GC] max pause time [ms]: " & $(gch.stat.maxPause div 1000_000)
     when traceGC: writeLeakage()
     GC_enable()
+
+{.pop.}
diff --git a/lib/system/profiler.nim b/lib/system/profiler.nim
index 295cb43c7..aef75d39d 100755
--- a/lib/system/profiler.nim
+++ b/lib/system/profiler.nim
@@ -8,54 +8,100 @@
 #
 
 # This file implements the Nimrod profiler. The profiler needs support by the
-# code generator. 
+# code generator. The idea is to inject the instruction stream
+# with 'nimProfile()' calls. These calls are injected at every loop end
+# (except perhaps loops that have no side-effects). At every Nth call a
+# stack trace is taken. A stack tace is a list of cstrings. We have a count
+# table of those.
+# 
+# The nice thing about this profiler is that it's completely time independent!
+
+{.push profiler: off.}
+
+when not defined(getTicks):
+  include "system/timers"
+
+const
+  MaxTraceLen = 20 # tracking the last 20 calls is enough
 
 type
-  TProfileData {.compilerproc, final.} = object
-    procname: cstring
-    total: float
+  TStackTrace* = array [0..MaxTraceLen-1, cstring]
+  TProfilerHook* = proc (st: TStackTrace) {.nimcall.}
+
+proc captureStackTrace(f: PFrame, st: var TStackTrace) =
+  const
+    firstCalls = 5
+  var
+    it = f
+    i = 0
+    total = 0
+  while it != nil and i <= high(st)-(firstCalls-1):
+    # the (-1) is for a nil entry that marks where the '...' should occur
+    st[i] = it.procname
+    inc(i)
+    inc(total)
+    it = it.prev
+  var b = it
+  while it != nil:
+    inc(total)
+    it = it.prev
+  for j in 1..total-i-(firstCalls-1): 
+    if b != nil: b = b.prev
+  if total != i:
+    st[i] = "..."
+    inc(i)
+  while b != nil and i <= high(st):
+    st[i] = b.procname
+    inc(i)
+    b = b.prev
 
 var
-  profileData {.compilerproc.}: array [0..64*1024-1, TProfileData]
-
-proc sortProfile(a: var array[0..64*1024-1, TProfileData], N: int) = 
-  # we use shellsort here; fast enough and simple
-  var h = 1
-  while true: 
-    h = 3 * h + 1
-    if h > N: break 
-  while true: 
-    h = h div 3
-    for i in countup(h, N - 1): 
-      var v = a[i]
-      var j = i
-      while a[j-h].total <= v.total: 
-        a[j] = a[j-h]
-        j = j-h
-        if j < h: break 
-      a[j] = v
-    if h == 1: break
-
-proc writeProfile() {.noconv.} =
-  const filename = "profile_results"
-  var i = 0
-  var f: TFile
-  var j = 1
-  while open(f, filename & $j & ".txt"):
-    close(f)
-    inc(j)
-  if open(f, filename & $j & ".txt", fmWrite):
-    var N = 0
-    # we have to compute the actual length of the array:
-    while profileData[N].procname != nil: inc(N)
-    sortProfile(profileData, N)
-    writeln(f, "total running time of each proc" &
-               " (interpret these numbers relatively)")
-    while profileData[i].procname != nil:
-      write(f, profileData[i].procname)
-      write(f, ": ")
-      writeln(f, profileData[i].total)
-      inc(i)
-    close(f)
-
-addQuitProc(writeProfile)
+  profilerHook*: TProfilerHook
+    ## set this variable to provide a procedure that implements a profiler in
+    ## user space. See the `nimprof` module for a reference implementation.
+  SamplingInterval = 50_000
+    # set this to change the default sampling interval
+  gTicker = SamplingInterval
+  interval: TNanos = 5_000_000 # 5ms
+
+proc callProfilerHook(hook: TProfilerHook) {.noinline.} =
+  # 'noinline' so that 'nimProfile' does not perform the stack allocation
+  # in the common case.
+  var st: TStackTrace
+  captureStackTrace(framePtr, st)
+  hook(st)
+
+proc setProfilingInterval*(intervalInUs: int): TNanos =
+  ## set this to change the sampling interval. Default value is 5ms.
+  interval = intervalInUs * 1000
+
+var t0: TTicks
+
+var usefulCall* = 0
+var uselessCall* = 0
+
+proc nimProfile() =
+  ## This is invoked by the compiler in every loop and on every proc entry!
+  dec gTicker
+  if gTicker == 0:
+    gTicker = -1
+    let t1 = getticks()
+    if getticks() - t0 > interval:
+      inc usefulCall
+      if not isNil(profilerHook):
+        # disable recursive calls: XXX should use try..finally,
+        # but that's too expensive!
+        let oldHook = profilerHook
+        profilerHook = nil
+        callProfilerHook(oldHook)
+        profilerHook = oldHook
+      t0 = getticks()
+    else:
+      inc uselessCall
+    gTicker = SamplingInterval
+
+proc stopProfiling*() =
+  ## call this to stop profiling; should be called by the client profiler.
+  profilerHook = nil
+
+{.pop.}
diff --git a/todo.txt b/todo.txt
index 7ed287284..91cd02429 100755
--- a/todo.txt
+++ b/todo.txt
@@ -1,7 +1,6 @@
 version 0.9.0
 =============
 
-- make 'm: stmt' use overloading resolution
 - make 'bind' default for templates and introduce 'mixin'
 
 - implicit deref for parameter matching
@@ -22,6 +21,7 @@ Bugs
 - bug: the parser is not strict enough with newlines: 'echo "a" echo "b"' 
   compiles
 - bug: blocks can "export" an identifier but the CCG generates {} for them ...
+- bug: what if we pass an iterator to a proc var? iterators use tyProc too ...
 
 
 version 0.9.XX
@@ -38,6 +38,7 @@ version 0.9.XX
     echo a
     echo b)
 
+- make 'm: stmt' use overloading resolution
 - implement "closure tuple consists of a single 'ref'" optimization
 - implement for loop transformation for first class iterators
 - JS gen: