summary refs log tree commit diff stats
path: root/tests/niminaction/Chapter6/WikipediaStats/parallel_counts.nim
diff options
context:
space:
mode:
Diffstat (limited to 'tests/niminaction/Chapter6/WikipediaStats/parallel_counts.nim')
-rw-r--r--tests/niminaction/Chapter6/WikipediaStats/parallel_counts.nim72
1 files changed, 72 insertions, 0 deletions
diff --git a/tests/niminaction/Chapter6/WikipediaStats/parallel_counts.nim b/tests/niminaction/Chapter6/WikipediaStats/parallel_counts.nim
new file mode 100644
index 000000000..7181145e9
--- /dev/null
+++ b/tests/niminaction/Chapter6/WikipediaStats/parallel_counts.nim
@@ -0,0 +1,72 @@
+import os, parseutils, threadpool, strutils
+
+type
+  Stats = ref object
+    ## One parsed page-count record, as filled in by `parse`.
+    domainCode: string  ## First space-separated field of a record line.
+    pageTitle: string   ## Second space-separated field of a record line.
+    countViews: int     ## Third field, parsed as an integer.
+    totalSize: int      ## Fourth field, parsed as an integer.
+
+proc newStats(): Stats =
+  ## Allocates a `Stats` whose string fields are empty and whose counters
+  ## are zero.
+  new(result)
+  result.domainCode = ""
+  result.pageTitle = ""
+  result.countViews = 0
+  result.totalSize = 0
+
+proc `$`(stats: Stats): string =
+  ## Renders all four fields of `stats` on one parenthesised line.
+  const layout =
+    "(domainCode: $#, pageTitle: $#, countViews: $#, totalSize: $#)"
+  let views = $stats.countViews
+  let size = $stats.totalSize
+  result = layout % [stats.domainCode, stats.pageTitle, views, size]
+
+proc parse(line: string, domainCode, pageTitle: var string,
+    countViews, totalSize: var int) =
+  ## Splits one record of the form
+  ## `<domainCode> <pageTitle> <countViews> <totalSize>` (fields separated
+  ## by a single space) into the four output parameters.
+  ##
+  ## All outputs are reset before anything else happens, so an empty
+  ## `line` yields `""` / `0` instead of values left over from a previous
+  ## call, and a numeric field that cannot be parsed stays `0`.
+  domainCode.setLen(0)
+  pageTitle.setLen(0)
+  countViews = 0
+  totalSize = 0
+  if line.len == 0: return
+
+  var i = 0
+  i.inc parseUntil(line, domainCode, {' '}, i)
+  i.inc  # step over the separating space
+  i.inc parseUntil(line, pageTitle, {' '}, i)
+  i.inc
+  i.inc parseInt(line, countViews, i)
+  i.inc
+  # Last field: the cursor is not read again, so the count is discarded.
+  discard parseInt(line, totalSize, i)
+
+proc parseChunk(chunk: string): Stats =
+  ## Parses every line of `chunk` and returns the record whose
+  ## `domainCode` is "en" and whose `countViews` is highest (the first one
+  ## wins on a tie). When no line qualifies, the blank record produced by
+  ## `newStats()` is returned.
+  var
+    code = ""
+    title = ""
+    views = 0
+    size = 0
+  result = newStats()
+  for row in chunk.splitLines:
+    parse(row, code, title, views, size)
+    let beatsBest = views > result.countViews
+    if code != "en" or not beatsBest:
+      continue
+    result = Stats(domainCode: code, pageTitle: title,
+                   countViews: views, totalSize: size)
+
+proc readPageCounts(filename: string, chunkSize = 1_000_000) =
+  ## Reads `filename` in pieces of roughly `chunkSize` bytes, passes each
+  ## piece to `parseChunk` via `spawn`, and echoes the record with the
+  ## highest `countViews` across all pieces.
+  ##
+  ## Every piece is cut just after the last line break it contains, so no
+  ## line is split between two `parseChunk` calls; the unfinished tail is
+  ## moved to the front of the buffer ahead of the next read. A line longer
+  ## than the buffer makes the buffer grow, and a final line without a
+  ## trailing line break is still parsed.
+  ##
+  ## `chunkSize` must be positive.
+  doAssert chunkSize > 0, "chunkSize must be positive"
+  let file = open(filename)
+  defer: file.close()  # release the handle even if reading or parsing raises
+
+  var responses = newSeq[FlowVar[Stats]]()
+  var buffer = newString(chunkSize)
+  var oldBufferLen = 0  # bytes at the buffer start carried from the last read
+  while not endOfFile(file):
+    let reqSize = buffer.len - oldBufferLen
+    let gotSize = file.readChars(buffer, oldBufferLen, reqSize)
+    if gotSize == 0:
+      break  # nothing more could be read; do not spin forever
+    let readSize = gotSize + oldBufferLen
+
+    # Walk back to just past the last line break in the valid data.
+    var chunkLen = readSize
+    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
+      chunkLen.dec
+
+    if chunkLen == 0:
+      # No complete line yet: keep everything and, when the buffer is
+      # already full, enlarge it so the next read can make progress.
+      if readSize == buffer.len:
+        buffer.setLen(buffer.len * 2)
+      oldBufferLen = readSize
+      continue
+
+    responses.add(spawn parseChunk(buffer[0 ..< chunkLen]))
+    oldBufferLen = readSize - chunkLen
+    if oldBufferLen > 0:
+      # Copy only the bytes actually read (up to readSize, not up to the
+      # end of the buffer) so both slices have the same length and the
+      # buffer is never resized by the assignment.
+      buffer[0 ..< oldBufferLen] = buffer[chunkLen ..< readSize]
+
+  if oldBufferLen > 0:
+    # The input did not end with a line break: parse the leftover tail.
+    responses.add(spawn parseChunk(buffer[0 ..< oldBufferLen]))
+
+  var mostPopular = newStats()
+  for resp in responses:
+    let statistic = ^resp  # blocks until that worker has finished
+    if statistic.countViews > mostPopular.countViews:
+      mostPopular = statistic
+
+  echo("Most popular is: ", mostPopular)
+
+when isMainModule:
+  # Run as a program: analyse the hard-coded dump file, looked up in the
+  # current working directory.
+  const dumpName = "pagecounts-20160101-050000"
+  readPageCounts(getCurrentDir() / dumpName)
\ No newline at end of file