# path: root/tests/niminaction/Chapter6/WikipediaStats/concurrency.nim
# blob: 913cd77db001f4dd559bcbf740ea92ad60f578fc
discard """
action: compile
"""

# For details on the file format, see:
# https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites
import tables, parseutils, strutils, threadpool

# Page-counts dump file that this program opens and analyses.
const filename = "pagecounts-20160101-050000"

type
  Stats = ref object
    ## Figures for a single page, as read from one line of the dump.
    projectName: string ## First field of a line, e.g. "en".
    pageTitle: string   ## Second field of a line, e.g. "Main_Page".
    requests: int       ## Third field of a line.
    contentSize: int    ## Fourth field of a line.

proc `$`(stats: Stats): string =
  ## Renders all four fields of `stats` as one parenthesised, comma-separated
  ## line of `name: value` pairs.
  const layout =
    "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)"
  let fields = [
    stats.projectName,
    stats.pageTitle,
    $stats.requests,
    $stats.contentSize
  ]
  result = layout.format(fields)

proc parse(chunk: string): Stats =
  ## Scans every line of `chunk` and returns the "en" entry with the highest
  ## request count. When no line qualifies, the returned `Stats` has empty
  ## strings and zero counters.
  ##
  ## Each line looks like: en Main_Page 242332 4737756101
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)

  # Scratch strings, declared once and refilled for every line.
  var
    projectBuf = ""
    titleBuf = ""
    requestsBuf = ""
    sizeBuf = ""

  for line in chunk.splitLines:
    var pos = 0

    template nextField(dest: var string) =
      # Copy the next whitespace-delimited token of `line` into `dest`, then
      # step over the whitespace that follows it.
      dest.setLen(0)
      pos.inc parseUntil(line, dest, Whitespace, pos)
      pos.inc skipWhitespace(line, pos)

    nextField(projectBuf)
    nextField(titleBuf)
    nextField(requestsBuf)
    nextField(sizeBuf)

    # Lines where either numeric field is empty are ignored.
    if requestsBuf.len > 0 and sizeBuf.len > 0:
      let requestCount = requestsBuf.parseInt
      if projectBuf == "en" and requestCount > result.requests:
        result = Stats(
          projectName: projectBuf,
          pageTitle: titleBuf,
          requests: requestCount,
          contentSize: sizeBuf.parseInt
        )

proc readChunks(filename: string, chunksize = 1000000): Stats =
  ## Reads `filename` through a buffer of `chunksize` bytes, spawns `parse` on
  ## each run of complete lines and returns the result with the highest
  ## `requests` value. If no chunk yields a result with `requests > 0`, the
  ## returned `Stats` has empty strings and zero counters.
  ##
  ## A line longer than the buffer makes the buffer grow, and a final line
  ## without a trailing line break is parsed as well.
  ##
  ## Any exception raised by `open` when the file cannot be opened propagates
  ## to the caller.
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  var file = open(filename)
  # Close the file on every exit path, including when an exception escapes.
  defer: file.close()

  var responses = newSeq[FlowVar[Stats]]()
  # At least one byte, so that every read can make progress.
  var buffer = newString(max(chunksize, 1))
  # Number of bytes at the start of `buffer` carried over from the last read.
  var carried = 0
  while not endOfFile(file):
    let readSize =
      file.readChars(buffer, carried, buffer.len - carried) + carried

    # Find where the last complete line ends.
    var chunkLen = readSize
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      chunkLen.dec

    if chunkLen == 0:
      # No line break anywhere in the buffer: keep all of it for the next
      # round. When the buffer is already full, enlarge it so that the next
      # read has room to reach the end of this line.
      carried = readSize
      if readSize == buffer.len:
        buffer.setLen(buffer.len * 2)
      continue

    responses.add(spawn parse(buffer[0 ..< chunkLen]))

    # Move the trailing partial line to the front of the buffer. Copying
    # char by char keeps the buffer length fixed; a slice assignment would
    # splice and resize the string whenever the two slice lengths differ.
    carried = readSize - chunkLen
    for i in 0 ..< carried:
      buffer[i] = buffer[chunkLen + i]

  if carried > 0:
    # The file did not end with a line break, so its last line is still
    # waiting in the buffer.
    responses.add(spawn parse(buffer[0 ..< carried]))

  # Wait for every spawned task and keep the best result.
  for resp in responses:
    let statistic = ^resp
    if statistic.requests > result.requests:
      result = statistic


when true:
  # Run the analysis over the dump file and print the winning entry.
  let mostRequested = readChunks(filename)
  echo mostRequested