diff options
Diffstat (limited to 'lib/pure/strscans.nim')
-rw-r--r-- | lib/pure/strscans.nim | 272 |
1 files changed, 117 insertions, 155 deletions
diff --git a/lib/pure/strscans.nim b/lib/pure/strscans.nim index 11f182495..16ef9e642 100644 --- a/lib/pure/strscans.nim +++ b/lib/pure/strscans.nim @@ -10,9 +10,9 @@ ##[ This module contains a `scanf`:idx: macro that can be used for extracting substrings from an input string. This is often easier than regular expressions. -Some examples as an apetizer: +Some examples as an appetizer: -.. code-block:: nim + ```nim # check if input string matches a triple of integers: const input = "(1,2,4)" var x, y, z: int @@ -26,6 +26,7 @@ Some examples as an apetizer: var myfloat: float if scanf(input, "$i-$i-$i $w$s$f", year, month, day, identifier, myfloat): echo "yes, we have a match!" + ``` As can be seen from the examples, strings are matched verbatim except for substrings starting with ``$``. These constructions are available: @@ -35,8 +36,9 @@ substrings starting with ``$``. These constructions are available: ``$o`` Matches an octal integer. This uses ``parseutils.parseOct``. ``$i`` Matches a decimal integer. This uses ``parseutils.parseInt``. ``$h`` Matches a hex integer. This uses ``parseutils.parseHex``. -``$f`` Matches a floating pointer number. Uses ``parseFloat``. -``$w`` Matches an ASCII identifier: ``[A-Z-a-z_][A-Za-z_0-9]*``. +``$f`` Matches a floating-point number. Uses ``parseFloat``. +``$w`` Matches an ASCII identifier: ``[A-Za-z_][A-Za-z_0-9]*``. +``$c`` Matches a single ASCII character. ``$s`` Skips optional whitespace. ``$$`` Matches a single dollar sign. ``$.`` Matches if the end of the input string has been reached. @@ -51,7 +53,7 @@ substrings starting with ``$``. These constructions are available: ================= ======================================================== Even though ``$*`` and ``$+`` look similar to the regular expressions ``.*`` -and ``.+`` they work quite differently, there is no non-deterministic +and ``.+``, they work quite differently. There is no non-deterministic state machine involved and the matches are non-greedy. ``[$*]`` matches ``[xyz]`` via ``parseutils.parseUntil``. @@ -76,14 +78,13 @@ One very nice advantage over regular expressions is that ``scanf`` is extensible with ordinary Nim procs. The proc is either enclosed in ``${}`` or in ``$[]``. ``${}`` matches and binds the result to a variable (that was passed to the ``scanf`` macro) while ``$[]`` merely -optional tokens. +matches optional tokens without any result binding. In this example, we define a helper proc ``someSep`` that skips some separators which we then use in our scanf pattern to help us in the matching process: -.. code-block:: nim - + ```nim proc someSep(input: string; start: int; seps: set[char] = {':','-','.'}): int = # Note: The parameters and return value must match to what ``scanf`` requires result = 0 @@ -91,11 +92,11 @@ which we then use in our scanf pattern to help us in the matching process: if scanf(input, "$w$[someSep]$w", key, value): ... + ``` -It also possible to pass arguments to a user definable matcher: - -.. code-block:: nim +It is also possible to pass arguments to a user definable matcher: + ```nim proc ndigits(input: string; intVal: var int; start: int; n: int): int = # matches exactly ``n`` digits. Matchers need to return 0 if nothing # matched or otherwise the number of processed chars. @@ -114,6 +115,7 @@ It also possible to pass arguments to a user definable matcher: var year, month, day: int if scanf("2013-01-03", "${ndigits(4)}-${ndigits(2)}-${ndigits(2)}$.", year, month, day): ... + ``` The scanp macro @@ -129,9 +131,9 @@ to use prefix instead of postfix operators. ``+E`` One or more ``?E`` Zero or One ``E{n,m}`` From ``n`` up to ``m`` times ``E`` -``~Ε`` Not predicate +``~E`` Not predicate ``a ^* b`` Shortcut for ``?(a *(b a))``. Usually used for separators. -``a ^* b`` Shortcut for ``?(a +(b a))``. Usually used for separators. +``a ^+ b`` Shortcut for ``?(a +(b a))``. Usually used for separators. ``'a'`` Matches a single character ``{'a'..'b'}`` Matches a character set ``"s"`` Matches a string @@ -144,8 +146,7 @@ not implemented. Simple example that parses the ``/etc/passwd`` file line by line: -.. code-block:: nim - + ```nim const etc_passwd = """root:x:0:0:root:/root:/bin/bash daemon:x:1:1:daemon:/usr/sbin:/bin/sh @@ -164,17 +165,17 @@ Simple example that parses the ``/etc/passwd`` file line by line: result.add entry else: break + ``` The ``scanp`` maps the grammar code into Nim code that performs the parsing. The parsing is performed with the help of 3 helper templates that that can be implemented for a custom type. These templates need to be named ``atom`` and ``nxt``. ``atom`` should be -overloaded to handle both single characters and sets of character. +overloaded to handle both `char` and `set[char]`. -.. code-block:: nim - - import streams + ```nim + import std/streams template atom(input: Stream; idx: int; c: char): bool = ## Used in scanp for the matching of atoms (usually chars). @@ -189,11 +190,11 @@ overloaded to handle both single characters and sets of character. if scanp(content, idx, +( ~{'\L', '\0'} -> entry.add(peekChar($input))), '\L'): result.add entry + ``` Calling ordinary Nim procs inside the macro is possible: -.. code-block:: nim - + ```nim proc digits(s: string; intVal: var int; start: int): int = var x = 0 while result+start < s.len and s[result+start] in {'0'..'9'} and s[result+start] != ':': @@ -219,12 +220,12 @@ Calling ordinary Nim procs inside the macro is possible: result.add login & " " & homedir else: break + ``` When used for matching, keep in mind that likewise scanf, no backtracking is performed. -.. code-block:: nim - + ```nim proc skipUntil(s: string; until: string; unless = '\0'; start: int): int = # Skips all characters until the string `until` is found. Returns 0 # if the char `unless` is found first or the end is reached. @@ -255,12 +256,12 @@ is performed. for r in collectLinks(body): echo r + ``` In this example both macros are combined seamlessly in order to maximise efficiency and perform different checks. -.. code-block:: nim - + ```nim iterator parseIps*(soup: string): string = ## ipv4 only! const digits = {'0'..'9'} @@ -278,11 +279,16 @@ efficiency and perform different checks. yield buf buf.setLen(0) # need to clear `buf` each time, cause it might contain garbage idx.inc - + ``` ]## -import macros, parseutils +import std/[macros, parseutils] +import std/private/since + +when defined(nimPreviewSlimSystem): + import std/assertions + proc conditionsToIfChain(n, idx, res: NimNode; start: int): NimNode = assert n.kind == nnkStmtList @@ -308,7 +314,7 @@ proc buildUserCall(x: string; args: varargs[NimNode]): NimNode = for i in 1..<y.len: result.add y[i] macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): bool = - ## See top level documentation of his module of how ``scanf`` works. + ## See top level documentation of this module about how ``scanf`` works. template matchBind(parser) {.dirty.} = var resLen = genSym(nskLet, "resLen") conds.add newLetStmt(resLen, newCall(bindSym(parser), inp, results[i], idx)) @@ -317,15 +323,16 @@ macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): b template at(s: string; i: int): char = (if i < s.len: s[i] else: '\0') template matchError() = - error("type mismatch between pattern '$" & pattern[p] & "' (position: " & $p & ") and " & $getType(results[i]) & - " var '" & repr(results[i]) & "'") + error("type mismatch between pattern '$" & pattern[p] & "' (position: " & $p & + ") and " & $getTypeInst(results[i]) & " var '" & repr(results[i]) & "'") var i = 0 var p = 0 var idx = genSym(nskVar, "idx") var res = genSym(nskVar, "res") let inp = genSym(nskLet, "inp") - result = newTree(nnkStmtListExpr, newLetStmt(inp, input), newVarStmt(idx, newLit 0), newVarStmt(res, newLit false)) + result = newTree(nnkStmtListExpr, newLetStmt(inp, input), + newVarStmt(idx, newLit 0), newVarStmt(res, newLit false)) var conds = newTree(nnkStmtList) var fullMatch = false while p < pattern.len: @@ -334,7 +341,8 @@ macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): b case pattern[p] of '$': var resLen = genSym(nskLet, "resLen") - conds.add newLetStmt(resLen, newCall(bindSym"skip", inp, newLit($pattern[p]), idx)) + conds.add newLetStmt(resLen, newCall(bindSym"skip", inp, + newLit($pattern[p]), idx)) conds.add resLen.notZero conds.add resLen of 'w': @@ -343,6 +351,12 @@ macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): b else: matchError inc i + of 'c': + if i < results.len and getType(results[i]).typeKind == ntyChar: + matchBind "parseChar" + else: + matchError + inc i of 'b': if i < results.len and getType(results[i]).typeKind == ntyInt: matchBind "parseBin" @@ -374,7 +388,8 @@ macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): b matchError inc i of 's': - conds.add newCall(bindSym"inc", idx, newCall(bindSym"skipWhitespace", inp, idx)) + conds.add newCall(bindSym"inc", idx, + newCall(bindSym"skipWhitespace", inp, idx)) conds.add newEmptyNode() conds.add newEmptyNode() of '.': @@ -385,14 +400,15 @@ macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): b of '*', '+': if i < results.len and getType(results[i]).typeKind == ntyString: var min = ord(pattern[p] == '+') - var q=p+1 + var q = p+1 var token = "" while q < pattern.len and pattern[q] != '$': token.add pattern[q] inc q var resLen = genSym(nskLet, "resLen") - conds.add newLetStmt(resLen, newCall(bindSym"parseUntil", inp, results[i], newLit(token), idx)) - conds.add newCall(bindSym"!=", resLen, newLit min) + conds.add newLetStmt(resLen, newCall(bindSym"parseUntil", inp, + results[i], newLit(token), idx)) + conds.add newCall(bindSym">=", resLen, newLit min) conds.add resLen else: matchError @@ -454,12 +470,61 @@ macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): b else: result.add res +macro scanTuple*(input: untyped; pattern: static[string]; matcherTypes: varargs[untyped]): untyped {.since: (1, 5).}= + ## Works identically as scanf, but instead of predeclaring variables it returns a tuple. + ## Tuple is started with a bool which indicates if the scan was successful + ## followed by the requested data. + ## If using a user defined matcher, provide the types in order they appear after pattern: + ## `line.scanTuple("${yourMatcher()}", int)` + runnableExamples: + let (success, year, month, day, time) = scanTuple("1000-01-01 00:00:00", "$i-$i-$i$s$+") + if success: + assert year == 1000 + assert month == 1 + assert day == 1 + assert time == "00:00:00" + var + p = 0 + userMatches = 0 + arguments: seq[NimNode] + result = newStmtList() + template addVar(typ: string) = + let varIdent = ident("temp" & $arguments.len) + result.add(newNimNode(nnkVarSection).add(newIdentDefs(varIdent, ident(typ), newEmptyNode()))) + arguments.add(varIdent) + while p < pattern.len: + if pattern[p] == '$': + inc p + case pattern[p] + of 'w', '*', '+': + addVar("string") + of 'c': + addVar("char") + of 'b', 'o', 'i', 'h': + addVar("int") + of 'f': + addVar("float") + of '{': + if userMatches < matcherTypes.len: + let varIdent = ident("temp" & $arguments.len) + result.add(newNimNode(nnkVarSection).add(newIdentDefs(varIdent, matcherTypes[userMatches], newEmptyNode()))) + arguments.add(varIdent) + inc userMatches + else: discard + inc p + result.add nnkTupleConstr.newTree(newCall(ident("scanf"), input, newStrLitNode(pattern))) + for arg in arguments: + result[^1][0].add arg + result[^1].add arg + result = newBlockStmt(result) + template atom*(input: string; idx: int; c: char): bool = ## Used in scanp for the matching of atoms (usually chars). - idx < input.len and input[idx] == c + ## EOF is matched as ``'\0'``. + (idx < input.len and input[idx] == c) or (idx == input.len and c == '\0') template atom*(input: string; idx: int; s: set[char]): bool = - idx < input.len and input[idx] in s + (idx < input.len and input[idx] in s) or (idx == input.len and '\0' in s) template hasNxt*(input: string; idx: int): bool = idx < input.len @@ -469,7 +534,7 @@ template success*(x: int): bool = x != 0 template nxt*(input: string; idx, step: int = 1) = inc(idx, step) macro scanp*(input, idx: typed; pattern: varargs[untyped]): bool = - ## See top level documentation of his module of how ``scanf`` works. + ## See top level documentation of this module about how ``scanp`` works. type StmtTriple = tuple[init, cond, action: NimNode] template interf(x): untyped = bindSym(x, brForceOpen) @@ -516,13 +581,14 @@ macro scanp*(input, idx: typed; pattern: varargs[untyped]): bool = of nnkCallKinds: # *{'A'..'Z'} !! s.add(!_) template buildWhile(input, idx, init, cond, action): untyped = + mixin hasNxt while hasNxt(input, idx): init if not cond: break action # (x) a # bind action a to (x) - if it[0].kind == nnkPar and it.len == 2: + if it[0].kind in {nnkPar, nnkTupleConstr} and it.len == 2: result = atm(it[0], input, idx, placeholder(it[1], input, idx)) elif it.kind == nnkInfix and it[0].eqIdent"->": # bind matching to some action: @@ -562,8 +628,8 @@ macro scanp*(input, idx: typed; pattern: varargs[untyped]): bool = if a.cond.kind == nnkEmpty or b.cond.kind == nnkEmpty: error("'|' operator applied to a non-condition") else: - result = (newStmtList(a.init, - newIfStmt((a.cond, a.action), (newTree(nnkStmtListExpr, b.init, b.cond), b.action))), + result = (newStmtList(a.init, newIfStmt((a.cond, a.action), + (newTree(nnkStmtListExpr, b.init, b.cond), b.action))), newEmptyNode(), newEmptyNode()) elif it.kind == nnkInfix and it[0].eqIdent"^*": # a ^* b is rewritten to: (a *(b a))? @@ -581,18 +647,22 @@ macro scanp*(input, idx: typed; pattern: varargs[untyped]): bool = else: var resLen = genSym(nskLet, "resLen") result = (newLetStmt(resLen, placeholder(it, input, idx)), - newCall(interf"success", resLen), !!newCall(interf"nxt", input, idx, resLen)) + newCall(interf"success", resLen), + !!newCall(interf"nxt", input, idx, resLen)) of nnkStrLit..nnkTripleStrLit: var resLen = genSym(nskLet, "resLen") result = (newLetStmt(resLen, newCall(interf"skip", input, it, idx)), - newCall(interf"success", resLen), !!newCall(interf"nxt", input, idx, resLen)) + newCall(interf"success", resLen), + !!newCall(interf"nxt", input, idx, resLen)) of nnkCurly, nnkAccQuoted, nnkCharLit: - result = (newEmptyNode(), newCall(interf"atom", input, idx, it), !!newCall(interf"nxt", input, idx)) + result = (newEmptyNode(), newCall(interf"atom", input, idx, it), + !!newCall(interf"nxt", input, idx)) of nnkCurlyExpr: if it.len == 3 and it[1].kind == nnkIntLit and it[2].kind == nnkIntLit: var h = newTree(nnkTupleConstr, it[0]) for count in 2i64 .. it[1].intVal: h.add(it[0]) - for count in it[1].intVal .. it[2].intVal-1: h.add(newTree(nnkPrefix, ident"?", it[0])) + for count in it[1].intVal .. it[2].intVal-1: + h.add(newTree(nnkPrefix, ident"?", it[0])) result = atm(h, input, idx, attached) elif it.len == 2 and it[1].kind == nnkIntLit: var h = newTree(nnkTupleConstr, it[0]) @@ -616,7 +686,7 @@ macro scanp*(input, idx: typed; pattern: varargs[untyped]): bool = #var idx = genSym(nskVar, "idx") var res = genSym(nskVar, "res") result = newTree(nnkStmtListExpr, #newVarStmt(idx, newCall(interf"prepare", input)), - newVarStmt(res, newLit false)) + newVarStmt(res, newLit false)) var conds: seq[StmtTriple] = @[] for it in pattern: conds.add atm(it, input, idx, nil) @@ -624,111 +694,3 @@ macro scanp*(input, idx: typed; pattern: varargs[untyped]): bool = result.add res when defined(debugScanp): echo repr result - - -when isMainModule: - proc twoDigits(input: string; x: var int; start: int): int = - if start+1 < input.len and input[start] == '0' and input[start+1] == '0': - result = 2 - x = 13 - else: - result = 0 - - proc someSep(input: string; start: int; seps: set[char] = {';',',','-','.'}): int = - result = 0 - while start+result < input.len and input[start+result] in seps: inc result - - proc demangle(s: string; res: var string; start: int): int = - while result+start < s.len and s[result+start] in {'_', '@'}: inc result - res = "" - while result+start < s.len and s[result+start] > ' ' and s[result+start] != '_': - res.add s[result+start] - inc result - while result+start < s.len and s[result+start] > ' ': - inc result - - proc parseGDB(resp: string): seq[string] = - const - digits = {'0'..'9'} - hexdigits = digits + {'a'..'f', 'A'..'F'} - whites = {' ', '\t', '\C', '\L'} - result = @[] - var idx = 0 - while true: - var prc = "" - var info = "" - if scanp(resp, idx, *`whites`, '#', *`digits`, +`whites`, ?("0x", *`hexdigits`, " in "), - demangle($input, prc, $index), *`whites`, '(', * ~ ')', ')', - *`whites`, "at ", +(~{'\C', '\L'} -> info.add($_)) ): - result.add prc & " " & info - else: - break - - var key, val: string - var intval: int - var floatval: float - doAssert scanf("abc:: xyz 89 33.25", "$w$s::$s$w$s$i $f", key, val, intval, floatVal) - doAssert key == "abc" - doAssert val == "xyz" - doAssert intval == 89 - doAssert floatVal == 33.25 - - var binval: int - var octval: int - var hexval: int - doAssert scanf("0b0101 0o1234 0xabcd", "$b$s$o$s$h", binval, octval, hexval) - doAssert binval == 0b0101 - doAssert octval == 0o1234 - doAssert hexval == 0xabcd - - let xx = scanf("$abc", "$$$i", intval) - doAssert xx == false - - - let xx2 = scanf("$1234", "$$$i", intval) - doAssert xx2 - - let yy = scanf(";.--Breakpoint00 [output]", "$[someSep]Breakpoint${twoDigits}$[someSep({';','.','-'})] [$+]$.", intVal, key) - doAssert yy - doAssert key == "output" - doAssert intVal == 13 - - var ident = "" - var idx = 0 - let zz = scanp("foobar x x x xWZ", idx, +{'a'..'z'} -> add(ident, $_), *(*{' ', '\t'}, "x"), ~'U', "Z") - doAssert zz - doAssert ident == "foobar" - - const digits = {'0'..'9'} - var year = 0 - var idx2 = 0 - if scanp("201655-8-9", idx2, `digits`{4,6} -> (year = year * 10 + ord($_) - ord('0')), "-8", "-9"): - doAssert year == 201655 - - const gdbOut = """ - #0 @foo_96013_1208911747@8 (x0=...) - at c:/users/anwender/projects/nim/temp.nim:11 - #1 0x00417754 in tempInit000 () at c:/users/anwender/projects/nim/temp.nim:13 - #2 0x0041768d in NimMainInner () - at c:/users/anwender/projects/nim/lib/system.nim:2605 - #3 0x004176b1 in NimMain () - at c:/users/anwender/projects/nim/lib/system.nim:2613 - #4 0x004176db in main (argc=1, args=0x712cc8, env=0x711ca8) - at c:/users/anwender/projects/nim/lib/system.nim:2620""" - const result = @["foo c:/users/anwender/projects/nim/temp.nim:11", - "tempInit000 c:/users/anwender/projects/nim/temp.nim:13", - "NimMainInner c:/users/anwender/projects/nim/lib/system.nim:2605", - "NimMain c:/users/anwender/projects/nim/lib/system.nim:2613", - "main c:/users/anwender/projects/nim/lib/system.nim:2620"] - #doAssert parseGDB(gdbOut) == result - - # bug #6487 - var count = 0 - - proc test(): string = - inc count - result = ",123123" - - var a: int - discard scanf(test(), ",$i", a) - doAssert count == 1 |