diff options
Diffstat (limited to 'tests/niminaction/Chapter6/WikipediaStats/concurrency_regex.nim')
-rw-r--r-- | tests/niminaction/Chapter6/WikipediaStats/concurrency_regex.nim | 68 |
1 files changed, 68 insertions, 0 deletions
diff --git a/tests/niminaction/Chapter6/WikipediaStats/concurrency_regex.nim b/tests/niminaction/Chapter6/WikipediaStats/concurrency_regex.nim new file mode 100644 index 000000000..102313de9 --- /dev/null +++ b/tests/niminaction/Chapter6/WikipediaStats/concurrency_regex.nim @@ -0,0 +1,68 @@ +discard """ +action: compile +""" + +# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites +import tables, parseutils, strutils, threadpool, re + +const filename = "pagecounts-20160101-050000" + +type + Stats = ref object + projectName, pageTitle: string + requests, contentSize: int + +proc `$`(stats: Stats): string = + "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)" % [ + stats.projectName, stats.pageTitle, $stats.requests, $stats.contentSize + ] + +proc parse(chunk: string): Stats = + # Each line looks like: en Main_Page 242332 4737756101 + result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0) + + var matches: array[4, string] + var reg = re"([^\s]+)\s([^\s]+)\s(\d+)\s(\d+)" + for line in chunk.splitLines: + + let start = find(line, reg, matches) + if start == -1: continue + + let requestsInt = matches[2].parseInt + if requestsInt > result.requests and matches[0] == "en": + result = Stats( + projectName: matches[0], + pageTitle: matches[1], + requests: requestsInt, + contentSize: matches[3].parseInt + ) + +proc readChunks(filename: string, chunksize = 1000000): Stats = + result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0) + var file = open(filename) + var responses = newSeq[FlowVar[Stats]]() + var buffer = newString(chunksize) + var oldBufferLen = 0 + while not endOfFile(file): + let readSize = file.readChars(buffer, oldBufferLen, chunksize - oldBufferLen) + oldBufferLen + var chunkLen = readSize + + while chunkLen >= 0 and buffer[chunkLen - 1] notin NewLines: + # Find where the last line ends + chunkLen.dec + + responses.add(spawn parse(buffer[0 ..< chunkLen])) + oldBufferLen = readSize - chunkLen + buffer[0 ..< oldBufferLen] = buffer[readSize - oldBufferLen .. ^1] + + echo("Spawns: ", responses.len) + for resp in responses: + let statistic = ^resp + if statistic.requests > result.requests: + result = statistic + + file.close() + + +when true: + echo readChunks(filename) |