1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites
import tables, parseutils, strutils, threadpool, re
# Name of the pagecounts dump that is analysed when this file is run as the
# main module. It is opened as given, so a relative name is resolved against
# the current working directory.
const filename = "pagecounts-20160101-050000"
type
  Stats = ref object
    ## One pagecounts entry: the four whitespace-separated columns of a line
    ## such as `en Main_Page 242332 4737756101`.
    projectName: string ## First column, e.g. `en`.
    pageTitle: string   ## Second column, e.g. `Main_Page`.
    requests: int       ## Third column, parsed as an integer.
    contentSize: int    ## Fourth column, parsed as an integer.
proc `$`(stats: Stats): string =
  ## Renders `stats` as one parenthesised line that lists all four fields
  ## together with their names.
  const layout =
    "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)"
  let requestText = $stats.requests
  let sizeText = $stats.contentSize
  result = layout.format(
    stats.projectName, stats.pageTitle, requestText, sizeText)
proc parse(chunk: string): Stats =
  ## Scans every line of `chunk` and returns the `en` entry with the highest
  ## request count.
  ##
  ## A line is expected to look like `en Main_Page 242332 4737756101`; lines
  ## that do not match this shape are ignored. When no `en` line has more than
  ## zero requests, the result has empty strings and zero counts.
  let linePattern = re"([^\s]+)\s([^\s]+)\s(\d+)\s(\d+)"
  var fields: array[4, string]
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  for entry in splitLines(chunk):
    if find(entry, linePattern, fields) != -1:
      # The request count is parsed for every matching line, whatever the
      # project, before the project filter is applied.
      let hits = parseInt(fields[2])
      if fields[0] == "en" and hits > result.requests:
        result = Stats(
          projectName: fields[0],
          pageTitle: fields[1],
          requests: hits,
          contentSize: parseInt(fields[3])
        )
proc readChunks(filename: string, chunksize = 1000000): Stats =
  ## Reads `filename` through a buffer of `chunksize` bytes, hands each run of
  ## complete lines to `parse` via `spawn`, and returns the chunk result with
  ## the highest request count.
  ##
  ## A partial line at the end of a buffer is carried over to the next read.
  ## A line longer than the buffer makes the buffer grow, and a last line
  ## without a trailing newline is parsed as well.
  ##
  ## Returns a `Stats` with empty strings and zero counts when no chunk result
  ## has more than zero requests. Raises `IOError` when the file cannot be
  ## opened.
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  var file = open(filename)
  # Close the handle on every exit path, including exceptions.
  defer: file.close()
  var responses = newSeq[FlowVar[Stats]]()
  # Keep at least one byte so that reading (and doubling) always progresses.
  var buffer = newString(max(chunksize, 1))
  # Number of bytes at the start of `buffer` that belong to a line whose end
  # has not been read yet.
  var oldBufferLen = 0
  while not endOfFile(file):
    let readSize = oldBufferLen +
      file.readChars(buffer, oldBufferLen, buffer.len - oldBufferLen)
    # Find where the last complete line ends. The guard is `> 0` so that a
    # buffer without any newline never indexes `buffer[-1]`.
    var chunkLen = readSize
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      chunkLen.dec
    if chunkLen == 0:
      # No complete line yet: keep everything and read more. If the buffer is
      # already full, enlarge it; otherwise the next read would request zero
      # bytes and the loop would never terminate.
      oldBufferLen = readSize
      if readSize == buffer.len:
        buffer.setLen(buffer.len * 2)
      continue
    responses.add(spawn parse(buffer[0 ..< chunkLen]))
    # Move the unfinished tail to the front. Only the bytes actually read are
    # copied, not whatever stale data sits at the end of the buffer.
    oldBufferLen = readSize - chunkLen
    for i in 0 ..< oldBufferLen:
      buffer[i] = buffer[chunkLen + i]
  if oldBufferLen > 0:
    # The file did not end with a newline: parse the final partial line too.
    responses.add(spawn parse(buffer[0 ..< oldBufferLen]))
  echo("Spawns: ", responses.len)
  for resp in responses:
    # `^` blocks until the spawned `parse` call has delivered its result.
    let statistic = ^resp
    if statistic.requests > result.requests:
      result = statistic
when isMainModule:
  # Analyse the configured dump and print the winning entry.
  let busiest = readChunks(filename)
  echo busiest
|