1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
discard """
action: compile
"""
# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites
import tables, parseutils, strutils, threadpool
# Name of the input file that the top-level statement at the end of this file
# passes to `readChunks`.
const filename = "pagecounts-20160101-050000"
type
  Stats = ref object
    ## One record of the input file: the four whitespace-separated fields of
    ## a line such as `en Main_Page 242332 4737756101`, in that order.
    projectName: string ## First field of the line (e.g. "en").
    pageTitle: string   ## Second field of the line.
    requests: int       ## Third field, parsed as an integer.
    contentSize: int    ## Fourth field, parsed as an integer.
proc `$`(stats: Stats): string =
  ## Renders all four fields of `stats` as one parenthesised,
  ## comma-separated `name: value` list.
  let fields = [
    stats.projectName,
    stats.pageTitle,
    $stats.requests,
    $stats.contentSize
  ]
  result = "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)" % fields
proc parse(chunk: string): Stats =
  ## Scans `chunk` line by line and returns the "en" entry with the highest
  ## request count.
  ##
  ## If no line qualifies, the returned `Stats` has empty strings and zero
  ## counters. Lines that are missing the request or size field, or whose
  ## numeric fields cannot be parsed as `int`, are skipped rather than
  ## raising, so one malformed line cannot abort the whole chunk.
  # Each line looks like: en Main_Page 242332 4737756101
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  var projectName = ""
  var pageTitle = ""
  var requests = ""
  var contentSize = ""
  for line in chunk.splitLines:
    # `i` is the cursor into `line`; each step consumes one field and the
    # whitespace that follows it.
    var i = 0
    projectName.setLen(0)
    i.inc parseUntil(line, projectName, Whitespace, i)
    i.inc skipWhitespace(line, i)
    pageTitle.setLen(0)
    i.inc parseUntil(line, pageTitle, Whitespace, i)
    i.inc skipWhitespace(line, i)
    requests.setLen(0)
    i.inc parseUntil(line, requests, Whitespace, i)
    i.inc skipWhitespace(line, i)
    contentSize.setLen(0)
    # Last field: the cursor is not needed afterwards.
    discard parseUntil(line, contentSize, Whitespace, i)
    if requests.len == 0 or contentSize.len == 0:
      # Ignore lines with either of the params that are empty.
      continue
    if projectName != "en":
      # Only "en" entries can become the result; skip the number parsing.
      continue
    try:
      let requestsInt = requests.parseInt
      if requestsInt > result.requests:
        # Parse the size before building the record so a bad size leaves
        # the current best untouched.
        let contentSizeInt = contentSize.parseInt
        result = Stats(
          projectName: projectName,
          pageTitle: pageTitle,
          requests: requestsInt,
          contentSize: contentSizeInt
        )
    except ValueError:
      # Non-numeric or out-of-range counter: treat the line as malformed.
      continue
proc readChunks(filename: string, chunksize = 1000000): Stats =
  ## Reads `filename` in pieces of about `chunksize` bytes, hands each piece
  ## to `parse` via `spawn`, and returns the result with the most requests.
  ##
  ## Pieces are cut right after a line terminator so that no line is split
  ## between two `parse` calls; the unfinished tail is carried over to the
  ## next read. A line longer than the buffer makes the buffer grow, and a
  ## final line without a trailing newline is parsed as well.
  ##
  ## `chunksize` must be positive. `open` raises `IOError` if the file
  ## cannot be opened.
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  doAssert chunksize > 0, "chunksize must be positive"
  var file = open(filename)
  # Close the file on every exit path, including exceptions.
  defer: file.close()
  var responses = newSeq[FlowVar[Stats]]()
  var buffer = newString(chunksize)
  # Number of bytes at the start of `buffer` left over from the last read.
  var oldBufferLen = 0
  while not endOfFile(file):
    let readSize = oldBufferLen +
      file.readChars(buffer, oldBufferLen, buffer.len - oldBufferLen)
    var chunkLen = readSize
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      # Find where the last line ends
      chunkLen.dec
    if chunkLen == 0:
      # No complete line in the buffer yet: keep everything and read more.
      if readSize == buffer.len:
        # The buffer is full, so the next read would request 0 bytes and
        # the loop would never advance; make room for the rest of the line.
        buffer.setLen(buffer.len * 2)
      oldBufferLen = readSize
      continue
    responses.add(spawn parse(buffer[0 ..< chunkLen]))
    oldBufferLen = readSize - chunkLen
    # Move the unfinished tail to the front. Both slices have the same
    # length, so this overwrites in place instead of resizing the buffer.
    buffer[0 ..< oldBufferLen] = buffer[chunkLen ..< readSize]
  if oldBufferLen > 0:
    # The file did not end with a newline; parse the remaining partial line.
    responses.add(spawn parse(buffer[0 ..< oldBufferLen]))
  for resp in responses:
    # `^` blocks until the spawned `parse` call has produced its value.
    let statistic = ^resp
    if statistic.requests > result.requests:
      result = statistic
when true:
  # Run the chunked scan over `filename` and print the winning entry; `echo`
  # stringifies it through the `$` overload for `Stats`.
  let busiest = readChunks(filename)
  echo busiest
|