diff options
Diffstat (limited to 'lib/pure/parseutils.nim')
-rw-r--r-- | lib/pure/parseutils.nim | 342 |
1 files changed, 342 insertions, 0 deletions
diff --git a/lib/pure/parseutils.nim b/lib/pure/parseutils.nim new file mode 100644 index 000000000..c07b713de --- /dev/null +++ b/lib/pure/parseutils.nim @@ -0,0 +1,342 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2012 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module contains helpers for parsing tokens, numbers, identifiers, etc. + +{.deadCodeElim: on.} + +{.push debugger:off .} # the user does not want to trace a part + # of the standard library! + +include "system/inclrtl" + +const + Whitespace = {' ', '\t', '\v', '\r', '\l', '\f'} + IdentChars = {'a'..'z', 'A'..'Z', '0'..'9', '_'} + IdentStartChars = {'a'..'z', 'A'..'Z', '_'} + ## copied from strutils + +proc toLower(c: char): char {.inline.} = + result = if c in {'A'..'Z'}: chr(ord(c)-ord('A')+ord('a')) else: c + +proc parseHex*(s: string, number: var int, start = 0): int {. + rtl, extern: "npuParseHex", noSideEffect.} = + ## Parses a hexadecimal number and stores its value in ``number``. + ## + ## Returns the number of the parsed characters or 0 in case of an error. This + ## proc is sensitive to the already existing value of ``number`` and will + ## likely not do what you want unless you make sure ``number`` is zero. You + ## can use this feature to *chain* calls, though the result int will quickly + ## overflow. Example: + ## + ## .. code-block:: nim + ## var value = 0 + ## discard parseHex("0x38", value) + ## assert value == 56 + ## discard parseHex("0x34", value) + ## assert value == 56 * 256 + 52 + ## value = -1 + ## discard parseHex("0x38", value) + ## assert value == -200 + ## + var i = start + var foundDigit = false + if s[i] == '0' and (s[i+1] == 'x' or s[i+1] == 'X'): inc(i, 2) + elif s[i] == '#': inc(i) + while true: + case s[i] + of '_': discard + of '0'..'9': + number = number shl 4 or (ord(s[i]) - ord('0')) + foundDigit = true + of 'a'..'f': + number = number shl 4 or (ord(s[i]) - ord('a') + 10) + foundDigit = true + of 'A'..'F': + number = number shl 4 or (ord(s[i]) - ord('A') + 10) + foundDigit = true + else: break + inc(i) + if foundDigit: result = i-start + +proc parseOct*(s: string, number: var int, start = 0): int {. + rtl, extern: "npuParseOct", noSideEffect.} = + ## parses an octal number and stores its value in ``number``. Returns + ## the number of the parsed characters or 0 in case of an error. + var i = start + var foundDigit = false + if s[i] == '0' and (s[i+1] == 'o' or s[i+1] == 'O'): inc(i, 2) + while true: + case s[i] + of '_': discard + of '0'..'7': + number = number shl 3 or (ord(s[i]) - ord('0')) + foundDigit = true + else: break + inc(i) + if foundDigit: result = i-start + +proc parseIdent*(s: string, ident: var string, start = 0): int = + ## parses an identifier and stores it in ``ident``. Returns + ## the number of the parsed characters or 0 in case of an error. + var i = start + if s[i] in IdentStartChars: + inc(i) + while s[i] in IdentChars: inc(i) + ident = substr(s, start, i-1) + result = i-start + +proc parseIdent*(s: string, start = 0): string = + ## parses an identifier and stores it in ``ident``. + ## Returns the parsed identifier or an empty string in case of an error. + result = "" + var i = start + + if s[i] in IdentStartChars: + inc(i) + while s[i] in IdentChars: inc(i) + + result = substr(s, start, i-1) + +proc parseToken*(s: string, token: var string, validChars: set[char], + start = 0): int {.inline, deprecated.} = + ## parses a token and stores it in ``token``. Returns + ## the number of the parsed characters or 0 in case of an error. A token + ## consists of the characters in `validChars`. + ## + ## **Deprecated since version 0.8.12**: Use ``parseWhile`` instead. + var i = start + while s[i] in validChars: inc(i) + result = i-start + token = substr(s, start, i-1) + +proc skipWhitespace*(s: string, start = 0): int {.inline.} = + ## skips the whitespace starting at ``s[start]``. Returns the number of + ## skipped characters. + while s[start+result] in Whitespace: inc(result) + +proc skip*(s, token: string, start = 0): int {.inline.} = + ## skips the `token` starting at ``s[start]``. Returns the length of `token` + ## or 0 if there was no `token` at ``s[start]``. + while result < token.len and s[result+start] == token[result]: inc(result) + if result != token.len: result = 0 + +proc skipIgnoreCase*(s, token: string, start = 0): int = + ## same as `skip` but case is ignored for token matching. + while result < token.len and + toLower(s[result+start]) == toLower(token[result]): inc(result) + if result != token.len: result = 0 + +proc skipUntil*(s: string, until: set[char], start = 0): int {.inline.} = + ## Skips all characters until one char from the set `until` is found + ## or the end is reached. + ## Returns number of characters skipped. + while s[result+start] notin until and s[result+start] != '\0': inc(result) + +proc skipUntil*(s: string, until: char, start = 0): int {.inline.} = + ## Skips all characters until the char `until` is found + ## or the end is reached. + ## Returns number of characters skipped. + while s[result+start] != until and s[result+start] != '\0': inc(result) + +proc skipWhile*(s: string, toSkip: set[char], start = 0): int {.inline.} = + ## Skips all characters while one char from the set `token` is found. + ## Returns number of characters skipped. + while s[result+start] in toSkip and s[result+start] != '\0': inc(result) + +proc parseUntil*(s: string, token: var string, until: set[char], + start = 0): int {.inline.} = + ## parses a token and stores it in ``token``. Returns + ## the number of the parsed characters or 0 in case of an error. A token + ## consists of the characters notin `until`. + var i = start + while i < s.len and s[i] notin until: inc(i) + result = i-start + token = substr(s, start, i-1) + +proc parseUntil*(s: string, token: var string, until: char, + start = 0): int {.inline.} = + ## parses a token and stores it in ``token``. Returns + ## the number of the parsed characters or 0 in case of an error. A token + ## consists of any character that is not the `until` character. + var i = start + while i < s.len and s[i] != until: inc(i) + result = i-start + token = substr(s, start, i-1) + +proc parseWhile*(s: string, token: var string, validChars: set[char], + start = 0): int {.inline.} = + ## parses a token and stores it in ``token``. Returns + ## the number of the parsed characters or 0 in case of an error. A token + ## consists of the characters in `validChars`. + var i = start + while s[i] in validChars: inc(i) + result = i-start + token = substr(s, start, i-1) + +proc captureBetween*(s: string, first: char, second = '\0', start = 0): string = + ## Finds the first occurrence of ``first``, then returns everything from there + ## up to ``second``(if ``second`` is '\0', then ``first`` is used). + var i = skipUntil(s, first, start)+1+start + result = "" + discard s.parseUntil(result, if second == '\0': first else: second, i) + +{.push overflowChecks: on.} +# this must be compiled with overflow checking turned on: +proc rawParseInt(s: string, b: var BiggestInt, start = 0): int = + var + sign: BiggestInt = -1 + i = start + if s[i] == '+': inc(i) + elif s[i] == '-': + inc(i) + sign = 1 + if s[i] in {'0'..'9'}: + b = 0 + while s[i] in {'0'..'9'}: + b = b * 10 - (ord(s[i]) - ord('0')) + inc(i) + while s[i] == '_': inc(i) # underscores are allowed and ignored + b = b * sign + result = i - start +{.pop.} # overflowChecks + +proc parseBiggestInt*(s: string, number: var BiggestInt, start = 0): int {. + rtl, extern: "npuParseBiggestInt", noSideEffect.} = + ## parses an integer starting at `start` and stores the value into `number`. + ## Result is the number of processed chars or 0 if there is no integer. + ## `EOverflow` is raised if an overflow occurs. + var res: BiggestInt + # use 'res' for exception safety (don't write to 'number' in case of an + # overflow exception: + result = rawParseInt(s, res, start) + number = res + +proc parseInt*(s: string, number: var int, start = 0): int {. + rtl, extern: "npuParseInt", noSideEffect.} = + ## parses an integer starting at `start` and stores the value into `number`. + ## Result is the number of processed chars or 0 if there is no integer. + ## `EOverflow` is raised if an overflow occurs. + var res: BiggestInt + result = parseBiggestInt(s, res, start) + if (sizeof(int) <= 4) and + ((res < low(int)) or (res > high(int))): + raise newException(OverflowError, "overflow") + elif result != 0: + number = int(res) + +proc parseBiggestFloat*(s: string, number: var BiggestFloat, start = 0): int {. + magic: "ParseBiggestFloat", importc: "nimParseBiggestFloat", noSideEffect.} + ## parses a float starting at `start` and stores the value into `number`. + ## Result is the number of processed chars or 0 if a parsing error + ## occurred. + +proc parseFloat*(s: string, number: var float, start = 0): int {. + rtl, extern: "npuParseFloat", noSideEffect.} = + ## parses a float starting at `start` and stores the value into `number`. + ## Result is the number of processed chars or 0 if there occurred a parsing + ## error. + var bf: BiggestFloat + result = parseBiggestFloat(s, bf, start) + if result != 0: + number = bf + +type + InterpolatedKind* = enum ## describes for `interpolatedFragments` + ## which part of the interpolated string is + ## yielded; for example in "str$$$var${expr}" + ikStr, ## ``str`` part of the interpolated string + ikDollar, ## escaped ``$`` part of the interpolated string + ikVar, ## ``var`` part of the interpolated string + ikExpr ## ``expr`` part of the interpolated string + +{.deprecated: [TInterpolatedKind: InterpolatedKind].} + +iterator interpolatedFragments*(s: string): tuple[kind: InterpolatedKind, + value: string] = + ## Tokenizes the string `s` into substrings for interpolation purposes. + ## + ## Example: + ## + ## .. code-block:: nim + ## for k, v in interpolatedFragments(" $this is ${an example} $$"): + ## echo "(", k, ", \"", v, "\")" + ## + ## Results in: + ## + ## .. code-block:: nim + ## (ikString, " ") + ## (ikExpr, "this") + ## (ikString, " is ") + ## (ikExpr, "an example") + ## (ikString, " ") + ## (ikDollar, "$") + var i = 0 + var kind: InterpolatedKind + while true: + var j = i + if s[j] == '$': + if s[j+1] == '{': + inc j, 2 + var nesting = 0 + while true: + case s[j] + of '{': inc nesting + of '}': + if nesting == 0: + inc j + break + dec nesting + of '\0': + raise newException(ValueError, + "Expected closing '}': " & substr(s, i, s.high)) + else: discard + inc j + inc i, 2 # skip ${ + kind = ikExpr + elif s[j+1] in IdentStartChars: + inc j, 2 + while s[j] in IdentChars: inc(j) + inc i # skip $ + kind = ikVar + elif s[j+1] == '$': + inc j, 2 + inc i # skip $ + kind = ikDollar + else: + raise newException(ValueError, + "Unable to parse a varible name at " & substr(s, i, s.high)) + else: + while j < s.len and s[j] != '$': inc j + kind = ikStr + if j > i: + # do not copy the trailing } for ikExpr: + yield (kind, substr(s, i, j-1-ord(kind == ikExpr))) + else: + break + i = j + +when isMainModule: + import sequtils + let input = "$test{} $this is ${an{ example}} " + let expected = @[(ikVar, "test"), (ikStr, "{} "), (ikVar, "this"), + (ikStr, " is "), (ikExpr, "an{ example}"), (ikStr, " ")] + assert toSeq(interpolatedFragments(input)) == expected + + var value = 0 + discard parseHex("0x38", value) + assert value == 56 + discard parseHex("0x34", value) + assert value == 56 * 256 + 52 + value = -1 + discard parseHex("0x38", value) + assert value == -200 + + +{.pop.} |