summary refs log tree commit diff stats
path: root/tests/niminaction/Chapter6/WikipediaStats/concurrency_regex.nim
blob: 8df3b6aeba7344c9c8762d84830e9e042fae42a3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites
import tables, parseutils, strutils, threadpool, re

const filename = "pagecounts-20160101-050000"

type
  Stats = ref object
    projectName, pageTitle: string
    requests, contentSize: int

proc `$`(stats: Stats): string =
  "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)" % [
    stats.projectName, stats.pageTitle, $stats.requests, $stats.contentSize
  ]

proc parse(chunk: string): Stats =
  # Each line looks like: en Main_Page 242332 4737756101
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)

  var matches: array[4, string]
  var reg = re"([^\s]+)\s([^\s]+)\s(\d+)\s(\d+)"
  for line in chunk.splitLines:

    let start = find(line, reg, matches)
    if start == -1: continue

    let requestsInt = matches[2].parseInt
    if requestsInt > result.requests and matches[0] == "en":
      result = Stats(
        projectName: matches[0],
        pageTitle: matches[1],
        requests: requestsInt,
        contentSize: matches[3].parseInt
      )

proc readChunks(filename: string, chunksize = 1000000): Stats =
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  var file = open(filename)
  var responses = newSeq[FlowVar[Stats]]()
  var buffer = newString(chunksize)
  var oldBufferLen = 0
  while not endOfFile(file):
    let readSize = file.readChars(buffer, oldBufferLen, chunksize - oldBufferLen) + oldBufferLen
    var chunkLen = readSize

    while chunkLen >= 0 and buffer[chunkLen - 1] notin NewLines:
      # Find where the last line ends
      chunkLen.dec

    responses.add(spawn parse(buffer[0 .. <chunkLen]))
    oldBufferLen = readSize - chunkLen
    buffer[0 .. <oldBufferLen] = buffer[readSize - oldBufferLen .. ^1]

  echo("Spawns: ", responses.len)
  for resp in responses:
    let statistic = ^resp
    if statistic.requests > result.requests:
      result = statistic

  file.close()


when isMainModule:
  echo readChunks(filename)