discard """
action: compile
"""
# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites
import tables, parseutils, strutils, threadpool, re
const
  # Name of the page-count file handed to `readChunks` at the bottom of
  # this module.
  filename = "pagecounts-20160101-050000"
type
  # One parsed page-count record. `parse` fills the fields from the four
  # whitespace-separated columns of a line, in this order.
  Stats = ref object
    projectName: string  # first column, e.g. "en"
    pageTitle: string    # second column, e.g. "Main_Page"
    requests: int        # third column, parsed as an integer
    contentSize: int     # fourth column, parsed as an integer
proc `$`(stats: Stats): string =
  ## Renders all four fields of `stats` as one parenthesised
  ## `name: value` list.
  const layout =
    "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)"
  let fields = [
    stats.projectName,
    stats.pageTitle,
    $stats.requests,
    $stats.contentSize
  ]
  result = layout % fields
proc parse(chunk: string): Stats =
  ## Scans `chunk` line by line and returns the record of the "en" project
  ## with the strictly highest request count. When no line qualifies, the
  ## returned `Stats` has empty strings and zero counters.
  # Each line looks like: en Main_Page 242332 4737756101
  let linePattern = re"([^\s]+)\s([^\s]+)\s(\d+)\s(\d+)"
  var captures: array[4, string]
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  for row in splitLines(chunk):
    # Lines that do not contain the four-column pattern are ignored.
    if find(row, linePattern, captures) == -1:
      continue
    let hits = parseInt(captures[2])
    # Only a strictly better "en" entry replaces the current best.
    if hits <= result.requests or captures[0] != "en":
      continue
    result = Stats(
      projectName: captures[0],
      pageTitle: captures[1],
      requests: hits,
      contentSize: parseInt(captures[3])
    )
proc readChunks(filename: string, chunksize = 1000000): Stats =
  ## Reads `filename` in buffers of roughly `chunksize` bytes, cuts each
  ## buffer at its last line break, and spawns `parse` on every cut piece.
  ## Returns the `Stats` with the highest `requests` among all pieces; the
  ## result keeps empty strings and zero counters when nothing qualifies.
  ##
  ## Bytes after the last line break of a buffer are carried over to the
  ## front of the next one so no line is split between two `parse` calls.
  ## A final line without a trailing line break is parsed as well, and a
  ## line longer than the buffer makes the buffer grow instead of failing.
  ##
  ## Prints the number of spawned tasks. Raises `IOError` when the file
  ## cannot be opened; the file is closed on every exit path.
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  let file = open(filename)
  defer: file.close()
  var responses = newSeq[FlowVar[Stats]]()
  # A buffer of at least one byte guarantees that every read makes progress.
  var buffer = newString(max(chunksize, 1))
  var oldBufferLen = 0
  while not endOfFile(file):
    if oldBufferLen == buffer.len:
      # The carried-over bytes fill the whole buffer: a single line is
      # longer than the buffer, so make room for the rest of it.
      buffer.setLen(buffer.len * 2)
    let readSize = file.readChars(buffer, oldBufferLen,
                                  buffer.len - oldBufferLen) + oldBufferLen
    var chunkLen = readSize
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      # Find where the last line ends
      chunkLen.dec
    if chunkLen > 0:
      responses.add(spawn parse(buffer[0 ..< chunkLen]))
      oldBufferLen = readSize - chunkLen
      # Move the unfinished tail to the front. Source indices are always
      # ahead of destination indices, so a forward copy is safe.
      for i in 0 ..< oldBufferLen:
        buffer[i] = buffer[chunkLen + i]
    else:
      # No line break seen yet: keep everything and read more.
      oldBufferLen = readSize
  if oldBufferLen > 0:
    # The file did not end with a line break; parse the remaining line.
    responses.add(spawn parse(buffer[0 ..< oldBufferLen]))
  echo("Spawns: ", responses.len)
  for resp in responses:
    let statistic = ^resp
    if statistic.requests > result.requests:
      result = statistic
when true:
  # Run the whole pipeline on the configured file and print the winner.
  let mostRequested = readChunks(filename)
  echo mostRequested