#
#
# Nim's Runtime Library
# (c) Copyright 2016 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
##[
This module contains a `scanf`:idx: macro that can be used for extracting
substrings from an input string. This is often easier than regular expressions.
Some examples as an appetizer:
.. code-block:: nim
# check if input string matches a triple of integers:
const input = "(1,2,4)"
var x, y, z: int
if scanf(input, "($i,$i,$i)", x, y, z):
echo "matches and x is ", x, " y is ", y, " z is ", z
# check if input string matches an ISO date followed by an identifier followed
# by whitespace and a floating point number:
var year, month, day: int
var identifier: string
var myfloat: float
if scanf(input, "$i-$i-$i $w$s$f", year, month, day, identifier, myfloat):
echo "yes, we have a match!"
As can be seen from the examples, strings are matched verbatim except for
substrings starting with ``$``. These constructions are available:
================= ========================================================
``$i`` Matches an integer. This uses ``parseutils.parseInt``.
``$f`` Matches a floating point number. Uses ``parseFloat``.
``$w`` Matches an ASCII identifier: ``[A-Za-z_][A-Za-z_0-9]*``.
``$s`` Skips optional whitespace.
``$$`` Matches a single dollar sign.
``$.`` Matches if the end of the input string has been reached.
``$*`` Matches until the token following the ``$*`` was found.
The match is allowed to be of 0 length.
``$+`` Matches until the token following the ``$+`` was found.
The match must consist of at least one char.
``${foo}`` User defined matcher. Uses the proc ``foo`` to perform
the match. See below for more details.
``$[foo]`` Call user defined proc ``foo`` to **skip** some optional
parts in the input string. See below for more details.
================= ========================================================
Even though ``$*`` and ``$+`` look similar to the regular expressions ``.*``
and ``.+`` they work quite differently, there is no non-deterministic
state machine involved and the matches are non-greedy. ``[$*]``
matches ``[xyz]`` via ``parseutils.parseUntil``.
Furthermore no backtracking is performed, if parsing fails after a value
has already been bound to a matched subexpression this value is not restored
to its original value. This rarely causes problems in practice and if it does
for you, it's easy enough to bind to a temporary variable first.
Startswith vs full match
========================
``scanf`` returns true if the input string **starts with** the specified
pattern. If instead it should only return true if there is also nothing
left in the input, append ``$.`` to your pattern.
User definable matchers
=======================
One very nice advantage over regular expressions is that ``scanf`` is
extensible with ordinary Nim procs. The proc is either enclosed in ``${}``
or in ``$[]``. ``${}`` matches and binds the result
to a variable (that was passed to the ``scanf`` macro) while ``$[]`` merely
matches optional tokens without binding any result.
In this example, we define a helper proc ``someSep`` that skips some separators
which we then use in our scanf pattern to help us in the matching process:
.. code-block:: nim
proc someSep(input: string; start: int; seps: set[char] = {':','-','.'}): int =
# Note: The parameters and return value must match to what ``scanf`` requires
result = 0
while input[start+result] in seps: inc result
if scanf(input, "$w$[someSep]$w", key, value):
...
It is also possible to pass arguments to a user definable matcher:
.. code-block:: nim
proc ndigits(input: string; intVal: var int; start: int; n: int): int =
# matches exactly ``n`` digits. Matchers need to return 0 if nothing
# matched or otherwise the number of processed chars.
var x = 0
var i = 0
while i < n and i+start < input.len and input[i+start] in {'0'..'9'}:
x = x * 10 + input[i+start].ord - '0'.ord
inc i
# only overwrite if we had a match
if i == n:
result = n
intVal = x
# match an ISO date extracting year, month, day at the same time.
# Also ensure the input ends after the ISO date:
var year, month, day: int
if scanf("2013-01-03", "${ndigits(4)}-${ndigits(2)}-${ndigits(2)}$.", year, month, day):
...
The scanp macro
===============
This module also implements a ``scanp`` macro, which syntax somewhat resembles
an EBNF or PEG grammar, except that it uses Nim's expression syntax and so has
to use prefix instead of postfix operators.
============== ===============================================================
``(E)`` Grouping
``*E`` Zero or more
``+E`` One or more
``?E`` Zero or One
``E{n,m}`` From ``n`` up to ``m`` times ``E``
``~E`` Not predicate
``a ^* b`` Shortcut for ``?(a *(b a))``. Usually used for separators.
``a ^+ b`` Shortcut for ``(a *(b a))``. Usually used for separators.
``'a'`` Matches a single character
``{'a'..'b'}`` Matches a character set
``"s"`` Matches a string
``E -> a`` Bind matching to some action
``$_`` Access the currently matched character
============== ===============================================================
Note that unordered or ordered choice operators (``/``, ``|``) are
not implemented.
Simple example that parses the ``/etc/passwd`` file line by line:
.. code-block:: nim
const
etc_passwd = """root:x:0:0:root:/root:/bin/bash
daemon:x:1:1:daemon:/usr/sbin:/bin/sh
bin:x:2:2:bin:/bin:/bin/sh
sys:x:3:3:sys:/dev:/bin/sh
nobody:x:65534:65534:nobody:/nonexistent:/bin/sh
messagebus:x:103:107::/var/run/dbus:/bin/false
"""
proc parsePasswd(content: string): seq[string] =
result = @[]
var idx = 0
while true:
var entry = ""
if scanp(content, idx, +(~{'\L', '\0'} -> entry.add($_)), '\L'):
result.add entry
else:
break
The ``scanp`` maps the grammar code into Nim code that performs the parsing.
The parsing is performed with the help of 3 helper templates that can be
implemented for a custom type.
These templates need to be named ``atom`` and ``nxt``. ``atom`` should be
overloaded to handle both single characters and sets of character.
.. code-block:: nim
import streams
template atom(input: Stream; idx: int; c: char): bool =
## Used in scanp for the matching of atoms (usually chars).
peekChar(input) == c
template atom(input: Stream; idx: int; s: set[char]): bool =
peekChar(input) in s
template nxt(input: Stream; idx, step: int = 1) =
inc(idx, step)
setPosition(input, idx)
if scanp(content, idx, +( ~{'\L', '\0'} -> entry.add(peekChar($input))), '\L'):
result.add entry
Calling ordinary Nim procs inside the macro is possible:
.. code-block:: nim
proc digits(s: string; intVal: var int; start: int): int =
var x = 0
while result+start < s.len and s[result+start] in {'0'..'9'} and s[result+start] != ':':
x = x * 10 + s[result+start].ord - '0'.ord
inc result
intVal = x
proc extractUsers(content: string): seq[string] =
# Extracts the username and home directory
# of each entry (with UID greater than 1000)
const
digits = {'0'..'9'}
result = @[]
var idx = 0
while true:
var login = ""
var uid = 0
var homedir = ""
if scanp(content, idx, *(~ {':', '\0'}) -> login.add($_), ':', * ~ ':', ':',
digits($input, uid, $index), ':', *`digits`, ':', * ~ ':', ':',
*('/', * ~{':', '/'}) -> homedir.add($_), ':', *('/', * ~{'\L', '/'}), '\L'):
if uid >= 1000:
result.add login & " " & homedir
else:
break
When used for matching, keep in mind that, just as with ``scanf``, no
backtracking is performed.
.. code-block:: nim
proc skipUntil(s: string; until: string; unless = '\0'; start: int): int =
# Skips all characters until the string `until` is found. Returns 0
# if the char `unless` is found first or the end is reached.
var i = start
var u = 0
while true:
if s[i] == '\0' or s[i] == unless:
return 0
elif s[i] == until[0]:
u = 1
while i+u < s.len and u < until.len and s[i+u] == until[u]:
inc u
if u >= until.len: break
inc(i)
result = i+u-start
iterator collectLinks(s: string): string =
const quote = {'\'', '"'}
var idx, old = 0
var res = ""
while idx < s.len:
old = idx
if scanp(s, idx, "<a", skipUntil($input, "href=", '>', $index),
`quote`, *( ~`quote`) -> res.add($_)):
yield res
res = ""
idx = old + 1
for r in collectLinks(body):
echo r
In this example both macros are combined seamlessly in order to maximise
efficiency and perform different checks.
.. code-block:: nim
iterator parseIps*(soup: string): string =
## ipv4 only!
const digits = {'0'..'9'}
var a, b, c, d: int
var buf = ""
var idx = 0
while idx < soup.len:
if scanp(soup, idx, (`digits`{1,3}, '.', `digits`{1,3}, '.',
`digits`{1,3}, '.', `digits`{1,3}) -> buf.add($_)):
discard buf.scanf("$i.$i.$i.$i", a, b, c, d)
if (a >= 0 and a <= 254) and
(b >= 0 and b <= 254) and
(c >= 0 and c <= 254) and
(d >= 0 and d <= 254):
yield buf
buf.setLen(0) # need to clear `buf` each time, cause it might contain garbage
idx.inc
]##
import macros, parseutils
proc conditionsToIfChain(n, idx, res: NimNode; start: int): NimNode =
  ## Folds the flat list `n` of (statement, condition, step) triples, starting
  ## at position `start`, into nested ``if`` statements.
  ##
  ## For each triple the statement is emitted first. If the condition is an
  ## empty node the remainder of the chain follows unconditionally; otherwise
  ## the remainder is wrapped in ``if condition: inc(idx, step); <rest>``.
  ## Once every triple has been consumed, ``res = true`` is emitted.
  assert n.kind == nnkStmtList
  if start >= n.len:
    return newAssignment(res, newLit true)
  let
    setup = n[start]
    guard = n[start+1]
  let tail =
    if guard.kind == nnkEmpty:
      conditionsToIfChain(n, idx, res, start+3)
    else:
      let advance = newCall(bindSym"inc", idx, n[start+2])
      let rest = conditionsToIfChain(n, idx, res, start+3)
      newIfStmt((guard, newTree(nnkStmtList, advance, rest)))
  result = newTree(nnkStmtList, setup, tail)
proc notZero(x: NimNode): NimNode =
  ## Builds the AST of the comparison ``x != 0``.
  result = newTree(nnkCall, bindSym"!=", x, newLit 0)
proc buildUserCall(x: string; args: varargs[NimNode]): NimNode =
  ## Parses the user supplied matcher expression `x` and builds a call to it.
  ##
  ## The resulting call receives `args` first; if `x` itself is written as a
  ## call (for example ``ndigits(4)``), the arguments spelled out in `x` are
  ## appended after `args`.
  let parsed = parseExpr(x)
  let isCall = parsed.kind in nnkCallKinds
  let callee = if isCall: parsed[0] else: parsed
  result = newTree(nnkCall, callee)
  for arg in args:
    result.add arg
  if isCall:
    for pos in 1 ..< parsed.len:
      result.add parsed[pos]
macro scanf*(input: string; pattern: static[string]; results: varargs[typed]): bool =
  ## See top level documentation of this module of how ``scanf`` works.
  ##
  ## `input` is evaluated exactly once. `pattern` is processed at compile
  ## time: every literal token and every ``$`` directive is translated into
  ## an (init statement, condition, step) triple, and the triples are chained
  ## into nested ``if`` statements. The variables in `results` are consumed
  ## left to right by the directives that bind a value. The generated
  ## expression is true if `input` starts with `pattern` (or matches it
  ## completely when `pattern` ends in ``$.``).
  template matchBind(parser) {.dirty.} =
    # Emits the triple for a `parseutils` parser that binds to `results[i]`
    # and returns the number of processed chars (0 means "no match").
    var resLen = genSym(nskLet, "resLen")
    conds.add newLetStmt(resLen, newCall(bindSym(parser), inp, results[i], idx))
    conds.add resLen.notZero
    conds.add resLen

  template at(s: string; i: int): char =
    # Bounds-safe read: positions past the end of the pattern read as '\0',
    # so a dangling '$' or an unclosed bracket is reported as a format error
    # instead of an index error.
    (if i < s.len: s[i] else: '\0')

  var i = 0 # index of the next unused entry of `results`
  var p = 0 # current position inside `pattern`
  let idx = genSym(nskVar, "idx")
  let res = genSym(nskVar, "res")
  let inp = genSym(nskLet, "inp")
  result = newTree(nnkStmtListExpr, newLetStmt(inp, input),
                   newVarStmt(idx, newLit 0), newVarStmt(res, newLit false))
  var conds = newTree(nnkStmtList)
  var fullMatch = false
  while p < pattern.len:
    if pattern[p] == '$':
      inc p
      case pattern.at(p)
      of '$':
        # escaped dollar sign: match it verbatim
        var resLen = genSym(nskLet, "resLen")
        conds.add newLetStmt(resLen, newCall(bindSym"skip", inp,
                                             newLit($pattern[p]), idx))
        conds.add resLen.notZero
        conds.add resLen
      of 'w':
        # a result variable must exist AND have the expected type
        if i < results.len and getType(results[i]).typeKind == ntyString:
          matchBind "parseIdent"
        else:
          error("no string var given for $w")
        inc i
      of 'i':
        if i < results.len and getType(results[i]).typeKind == ntyInt:
          matchBind "parseInt"
        else:
          error("no int var given for $i")
        inc i
      of 'f':
        if i < results.len and getType(results[i]).typeKind == ntyFloat:
          matchBind "parseFloat"
        else:
          error("no float var given for $f")
        inc i
      of 's':
        # optional whitespace: unconditional step, hence no condition
        conds.add newCall(bindSym"inc", idx,
                          newCall(bindSym"skipWhitespace", inp, idx))
        conds.add newEmptyNode()
        conds.add newEmptyNode()
      of '.':
        # only valid as the very last directive of the pattern
        if p == pattern.len-1:
          fullMatch = true
        else:
          error("invalid format string")
      of '*', '+':
        if i < results.len and getType(results[i]).typeKind == ntyString:
          # `$*` may match nothing, `$+` needs at least one char
          let min = ord(pattern[p] == '+')
          # peek at the literal token that terminates the match
          var q = p+1
          var token = ""
          while q < pattern.len and pattern[q] != '$':
            token.add pattern[q]
            inc q
          var resLen = genSym(nskLet, "resLen")
          conds.add newLetStmt(resLen, newCall(bindSym"parseUntil", inp,
                                               results[i], newLit(token), idx))
          conds.add newCall(bindSym">=", resLen, newLit min)
          conds.add resLen
        else:
          error("no string var given for $" & pattern[p])
        inc i
      of '{':
        # user defined matcher that binds to the next result variable
        inc p
        var nesting = 0
        let start = p
        while true:
          case pattern.at(p)
          of '{': inc nesting
          of '}':
            if nesting == 0: break
            dec nesting
          of '\0': error("expected closing '}'")
          else: discard
          inc p
        let expr = pattern.substr(start, p-1)
        if i < results.len:
          var resLen = genSym(nskLet, "resLen")
          conds.add newLetStmt(resLen, buildUserCall(expr, inp, results[i], idx))
          conds.add resLen.notZero
          conds.add resLen
        else:
          error("no var given for $" & expr)
        inc i
      of '[':
        # user defined skipper: advances unconditionally, binds nothing
        inc p
        var nesting = 0
        let start = p
        while true:
          case pattern.at(p)
          of '[': inc nesting
          of ']':
            if nesting == 0: break
            dec nesting
          of '\0': error("expected closing ']'")
          else: discard
          inc p
        let expr = pattern.substr(start, p-1)
        conds.add newCall(bindSym"inc", idx, buildUserCall(expr, inp, idx))
        conds.add newEmptyNode()
        conds.add newEmptyNode()
      else: error("invalid format string")
      inc p
    else:
      # literal text up to the next '$' has to match verbatim
      var token = ""
      while p < pattern.len and pattern[p] != '$':
        token.add pattern[p]
        inc p
      var resLen = genSym(nskLet, "resLen")
      conds.add newLetStmt(resLen, newCall(bindSym"skip", inp, newLit(token), idx))
      conds.add resLen.notZero
      conds.add resLen
  result.add conditionsToIfChain(conds, idx, res, 0)
  if fullMatch:
    # `$.`: additionally require that the whole input was consumed
    result.add newCall(bindSym"and", res,
      newCall(bindSym">=", idx, newCall(bindSym"len", inp)))
  else:
    result.add res
template atom*(input: string; idx: int; c: char): bool =
  ## Used in scanp for the matching of atoms (usually chars).
  ## The end of `input` (``idx == input.len``) is matched as ``'\0'`` so
  ## that patterns never index past the end of the string.
  (idx < input.len and input[idx] == c) or (idx == input.len and c == '\0')
template atom*(input: string; idx: int; s: set[char]): bool =
  ## Used in scanp for the matching of a character set.
  ## The end of `input` (``idx == input.len``) is matched as ``'\0'`` so
  ## that patterns never index past the end of the string.
  (idx < input.len and input[idx] in s) or (idx == input.len and '\0' in s)
#template prepare*(input: string): int = 0
template success*(x: int): bool =
  ## Used in scanp to decide whether a matcher result `x` counts as a
  ## match: any value other than 0 does.
  not (x == 0)
template nxt*(input: string; idx, step: int = 1) =
  ## Used in scanp to advance the position `idx` by `step` (1 by default).
  ## `input` is not used by this string overload.
  idx.inc(step)
macro scanp*(input, idx: typed; pattern: varargs[untyped]): bool =
  ## See top level documentation of this module of how ``scanp`` works.
  ##
  ## Every element of `pattern` is translated by the inner proc `atm` into an
  ## (init, cond, action) triple; the triples are then chained into nested
  ## ``if`` statements by `toIfChain`. The generated expression evaluates to
  ## a bool that is set to true once all elements have matched.
  type StmtTriple = tuple[init, cond, action: NimNode]

  # Binds `x` with `brForceOpen` so that the helper names (`atom`, `nxt`,
  # `success`, `skip`) are looked up as open symbols.
  template interf(x): untyped = bindSym(x, brForceOpen)

  proc toIfChain(n: seq[StmtTriple]; idx, res: NimNode; start: int): NimNode =
    # Recursively turns the triples n[start .. ^1] into nested `if`
    # statements; once all triples are consumed `res = true` is emitted.
    if start >= n.len: return newAssignment(res, newLit true)
    var ifs: NimNode = nil
    if n[start].cond.kind == nnkEmpty:
      # no condition: the rest of the chain follows unconditionally
      ifs = toIfChain(n, idx, res, start+1)
    else:
      ifs = newIfStmt((n[start].cond,
        newTree(nnkStmtList, n[start].action,
          toIfChain(n, idx, res, start+1))))
    result = newTree(nnkStmtList, n[start].init, ifs)

  proc attach(x, attached: NimNode): NimNode =
    # Prepends the user action `attached` (if there is one) to `x`.
    if attached == nil: x
    else: newStmtList(attached, x)

  proc placeholder(n, x, j: NimNode): NimNode =
    # Returns a copy of `n` in which `$_`/`$current` is replaced by `x[j]`,
    # `$input` by `x` and `$i`/`$index` by `j`; any other `$name` is an error.
    if n.kind == nnkPrefix and n[0].eqIdent("$"):
      let n1 = n[1]
      if n1.eqIdent"_" or n1.eqIdent"current":
        result = newTree(nnkBracketExpr, x, j)
      elif n1.eqIdent"input":
        result = x
      elif n1.eqIdent"i" or n1.eqIdent"index":
        result = j
      else:
        error("unknown pattern " & repr(n))
    else:
      result = copyNimNode(n)
      for i in 0 ..< n.len:
        result.add placeholder(n[i], x, j)

  proc atm(it, input, idx, attached: NimNode): StmtTriple =
    # Translates the single pattern element `it` into a triple. `attached`
    # is an optional action that is run right before the position advances.
    template `!!`(x): untyped = attach(x, attached)
    case it.kind
    of nnkIdent:
      # a bare identifier is called as `it(input, idx)`; its result is the
      # number of chars to advance
      var resLen = genSym(nskLet, "resLen")
      result = (newLetStmt(resLen, newCall(it, input, idx)),
        newCall(interf"success", resLen),
        !!newCall(interf"nxt", input, idx, resLen))
    of nnkCallKinds:
      # *{'A'..'Z'} !! s.add(!_)
      template buildWhile(init, cond, action): untyped =
        while true:
          init
          if not cond: break
          action
      # (x) a # bind action a to (x)
      if it[0].kind == nnkPar and it.len == 2:
        result = atm(it[0], input, idx, placeholder(it[1], input, idx))
      elif it.kind == nnkInfix and it[0].eqIdent"->":
        # bind matching to some action:
        result = atm(it[1], input, idx, placeholder(it[2], input, idx))
      elif it.kind == nnkInfix and it[0].eqIdent"as":
        # `matcher as name`: store the matcher's result in `name`
        let cond = if it[1].kind in nnkCallKinds: placeholder(it[1], input, idx)
                   else: newCall(it[1], input, idx)
        result = (newLetStmt(it[2], cond),
          newCall(interf"success", it[2]),
          !!newCall(interf"nxt", input, idx, it[2]))
      elif it.kind == nnkPrefix and it[0].eqIdent"*":
        # zero or more: loop until the condition fails
        let (init, cond, action) = atm(it[1], input, idx, attached)
        result = (getAst(buildWhile(init, cond, action)),
          newEmptyNode(), newEmptyNode())
      elif it.kind == nnkPrefix and it[0].eqIdent"+":
        # x+ is the same as xx*
        result = atm(newTree(nnkPar, it[1], newTree(nnkPrefix, ident"*", it[1])),
          input, idx, attached)
      elif it.kind == nnkPrefix and it[0].eqIdent"?":
        # optional.
        let (init, cond, action) = atm(it[1], input, idx, attached)
        if cond.kind == nnkEmpty:
          error("'?' operator applied to a non-condition")
        else:
          result = (newTree(nnkStmtList, init, newIfStmt((cond, action))),
            newEmptyNode(), newEmptyNode())
      elif it.kind == nnkPrefix and it[0].eqIdent"~":
        # not operator
        let (init, cond, action) = atm(it[1], input, idx, attached)
        if cond.kind == nnkEmpty:
          error("'~' operator applied to a non-condition")
        else:
          result = (init, newCall(bindSym"not", cond), action)
      elif it.kind == nnkInfix and it[0].eqIdent"|":
        # alternative: try `a`, otherwise `b`; the result carries no condition
        let a = atm(it[1], input, idx, attached)
        let b = atm(it[2], input, idx, attached)
        if a.cond.kind == nnkEmpty or b.cond.kind == nnkEmpty:
          error("'|' operator applied to a non-condition")
        else:
          result = (newStmtList(a.init,
            newIfStmt((a.cond, a.action), (newTree(nnkStmtListExpr, b.init, b.cond), b.action))),
            newEmptyNode(), newEmptyNode())
      elif it.kind == nnkInfix and it[0].eqIdent"^*":
        # a ^* b is rewritten to: (a *(b a))?
        #exprList = expr ^+ comma
        template tmp(a, b): untyped = ?(a, *(b, a))
        result = atm(getAst(tmp(it[1], it[2])), input, idx, attached)
      elif it.kind == nnkInfix and it[0].eqIdent"^+":
        # a ^+ b is rewritten to: (a *(b a))
        template tmp(a, b): untyped = (a, *(b, a))
        result = atm(getAst(tmp(it[1], it[2])), input, idx, attached)
      elif it.kind == nnkCommand and it.len == 2 and it[0].eqIdent"pred":
        # enforce that the wrapped call is interpreted as a predicate, not a non-terminal:
        result = (newEmptyNode(), placeholder(it[1], input, idx), newEmptyNode())
      else:
        # any other call is a user matcher returning the number of chars
        # to advance
        var resLen = genSym(nskLet, "resLen")
        result = (newLetStmt(resLen, placeholder(it, input, idx)),
          newCall(interf"success", resLen), !!newCall(interf"nxt", input, idx, resLen))
    of nnkStrLit..nnkTripleStrLit:
      # string literal: matched via `skip`
      var resLen = genSym(nskLet, "resLen")
      result = (newLetStmt(resLen, newCall(interf"skip", input, it, idx)),
        newCall(interf"success", resLen), !!newCall(interf"nxt", input, idx, resLen))
    of nnkCurly, nnkAccQuoted, nnkCharLit:
      # char, char set or backquoted name: tested with `atom`, advance by one
      result = (newEmptyNode(), newCall(interf"atom", input, idx, it), !!newCall(interf"nxt", input, idx))
    of nnkCurlyExpr:
      # E{n,m} / E{n}: expanded into a concatenation of copies of E, the
      # ones beyond `n` being wrapped in `?`
      if it.len == 3 and it[1].kind == nnkIntLit and it[2].kind == nnkIntLit:
        var h = newTree(nnkPar, it[0])
        for count in 2..it[1].intVal: h.add(it[0])
        for count in it[1].intVal .. it[2].intVal-1: h.add(newTree(nnkPrefix, ident"?", it[0]))
        result = atm(h, input, idx, attached)
      elif it.len == 2 and it[1].kind == nnkIntLit:
        var h = newTree(nnkPar, it[0])
        for count in 2..it[1].intVal: h.add(it[0])
        result = atm(h, input, idx, attached)
      else:
        error("invalid pattern")
    of nnkPar:
      if it.len == 1:
        result = atm(it[0], input, idx, attached)
      else:
        # concatenation:
        var conds: seq[StmtTriple] = @[]
        for x in it: conds.add atm(x, input, idx, attached)
        var res = genSym(nskVar, "res")
        result = (newStmtList(newVarStmt(res, newLit false),
          toIfChain(conds, idx, res, 0)), res, newEmptyNode())
    else:
      error("invalid pattern")

  #var idx = genSym(nskVar, "idx")
  var res = genSym(nskVar, "res")
  result = newTree(nnkStmtListExpr, #newVarStmt(idx, newCall(interf"prepare", input)),
    newVarStmt(res, newLit false))
  var conds: seq[StmtTriple] = @[]
  for it in pattern:
    conds.add atm(it, input, idx, nil)
  result.add toIfChain(conds, idx, res, 0)
  result.add res
  # compile with -d:debugScanp to print the generated code
  when defined(debugScanp):
    echo repr result
when isMainModule:
  # Self-test of ``scanf`` and ``scanp``; only compiled when this module is
  # the main module. The helper matchers check the input length before
  # indexing so that they never read past the end of the input.
  proc twoDigits(input: string; x: var int; start: int): int =
    ## Matches the two chars "00" at `start`; on a match binds 13 to `x`
    ## and returns 2, otherwise returns 0.
    if start+1 < input.len and input[start] == '0' and input[start+1] == '0':
      result = 2
      x = 13
    else:
      result = 0

  proc someSep(input: string; start: int; seps: set[char] = {';',',','-','.'}): int =
    ## Returns the number of consecutive chars from `seps` found at `start`.
    result = 0
    while start+result < input.len and input[start+result] in seps: inc result

  proc demangle(s: string; res: var string; start: int): int =
    ## Skips leading '_' and '@', stores the following chars (up to the next
    ## '_' or char <= ' ') in `res`, then skips the remaining chars that are
    ## greater than ' '. Returns the total number of processed chars.
    while result+start < s.len and s[result+start] in {'_', '@'}: inc result
    res = ""
    while result+start < s.len and s[result+start] > ' ' and s[result+start] != '_':
      res.add s[result+start]
      inc result
    while result+start < s.len and s[result+start] > ' ':
      inc result

  proc parseGDB(resp: string): seq[string] =
    ## Extracts "<proc name> <location>" for every stack frame found in the
    ## GDB backtrace `resp`.
    const
      digits = {'0'..'9'}
      hexdigits = digits + {'a'..'f', 'A'..'F'}
      whites = {' ', '\t', '\C', '\L'}
    result = @[]
    var idx = 0
    while true:
      var prc = ""
      var info = ""
      if scanp(resp, idx, *`whites`, '#', *`digits`, +`whites`, ?("0x", *`hexdigits`, " in "),
               demangle($input, prc, $index), *`whites`, '(', * ~ ')', ')',
               *`whites`, "at ", +(~{'\C', '\L', '\0'} -> info.add($_)) ):
        result.add prc & " " & info
      else:
        break

  var key, val: string
  var intVal: int
  var floatVal: float
  doAssert scanf("abc:: xyz 89 33.25", "$w$s::$s$w$s$i $f", key, val, intVal, floatVal)
  doAssert key == "abc"
  doAssert val == "xyz"
  doAssert intVal == 89
  doAssert floatVal == 33.25

  let xx = scanf("$abc", "$$$i", intVal)
  doAssert xx == false

  let xx2 = scanf("$1234", "$$$i", intVal)
  doAssert xx2

  let yy = scanf(";.--Breakpoint00 [output]", "$[someSep]Breakpoint${twoDigits}$[someSep({';','.','-'})] [$+]$.", intVal, key)
  doAssert yy
  doAssert key == "output"
  doAssert intVal == 13

  var ident = ""
  var idx = 0
  let zz = scanp("foobar x x x xWZ", idx, +{'a'..'z'} -> add(ident, $_), *(*{' ', '\t'}, "x"), ~'U', "Z")
  doAssert zz
  doAssert ident == "foobar"

  const digits = {'0'..'9'}
  var year = 0
  var idx2 = 0
  if scanp("201655-8-9", idx2, `digits`{4,6} -> (year = year * 10 + ord($_) - ord('0')), "-8", "-9"):
    doAssert year == 201655

  const gdbOut = """
#0 @foo_96013_1208911747@8 (x0=...)
at c:/users/anwender/projects/nim/temp.nim:11
#1 0x00417754 in tempInit000 () at c:/users/anwender/projects/nim/temp.nim:13
#2 0x0041768d in NimMainInner ()
at c:/users/anwender/projects/nim/lib/system.nim:2605
#3 0x004176b1 in NimMain ()
at c:/users/anwender/projects/nim/lib/system.nim:2613
#4 0x004176db in main (argc=1, args=0x712cc8, env=0x711ca8)
at c:/users/anwender/projects/nim/lib/system.nim:2620"""
  const result = @["foo c:/users/anwender/projects/nim/temp.nim:11",
          "tempInit000 c:/users/anwender/projects/nim/temp.nim:13",
          "NimMainInner c:/users/anwender/projects/nim/lib/system.nim:2605",
          "NimMain c:/users/anwender/projects/nim/lib/system.nim:2613",
          "main c:/users/anwender/projects/nim/lib/system.nim:2620"]
  doAssert parseGDB(gdbOut) == result

  # bug #6487
  # the input expression must be evaluated exactly once
  var count = 0

  proc test(): string =
    inc count
    result = ",123123"

  var a: int
  discard scanf(test(), ",$i", a)
  doAssert count == 1