diff options
Diffstat (limited to 'lib/pure/parseutils.nim')
-rw-r--r-- | lib/pure/parseutils.nim | 301 |
1 files changed, 227 insertions, 74 deletions
diff --git a/lib/pure/parseutils.nim b/lib/pure/parseutils.nim index fb4bc19af..ba09347a2 100644 --- a/lib/pure/parseutils.nim +++ b/lib/pure/parseutils.nim @@ -7,9 +7,46 @@ # distribution, for details about the copyright. # -## This module contains helpers for parsing tokens, numbers, identifiers, etc. +## This module contains helpers for parsing tokens, numbers, integers, floats, +## identifiers, etc. ## ## To unpack raw bytes look at the `streams <streams.html>`_ module. +## +## +## .. code-block:: +## import parseutils +## +## let logs = @["2019-01-10: OK_", "2019-01-11: FAIL_", "2019-01: aaaa"] +## +## for log in logs: +## var res: string +## if parseUntil(log, res, ':') == 10: # YYYY-MM-DD == 10 +## echo res & " - " & captureBetween(log, ' ', '_') +## # => 2019-01-10 - OK +## +## +## .. code-block:: +## import parseutils +## from strutils import Digits, parseInt +## +## let userInput1 = "2019 school start" +## let userInput2 = "3 years back" +## +## let startYear = input1[0..skipWhile(input1, Digits)-1] # 2019 +## let yearsBack = input2[0..skipWhile(input2, Digits)-1] # 3 +## +## echo "Examination is in " & $(parseInt(startYear) + parseInt(yearsBack)) +## +## +## **See also:** +## * `strutils module<strutils.html>`_ for combined and identical parsing proc's +## * `json module<json.html>`_ for a JSON parser +## * `parsecfg module<parsecfg.html>`_ for a configuration file parser +## * `parsecsv module<parsecsv.html>`_ for a simple CSV (comma separated value) parser +## * `parseopt module<parseopt.html>`_ for a command line parser +## * `parsexml module<parsexml.html>`_ for a XML / HTML parser +## * `other parsers<lib.html#pure-libraries-parsers>`_ for other parsers + {.deadCodeElim: on.} # dce option deprecated @@ -35,21 +72,20 @@ proc parseHex*(s: string, number: var int, start = 0; maxLen = 0): int {. ## proc is sensitive to the already existing value of ``number`` and will ## likely not do what you want unless you make sure ``number`` is zero. You ## can use this feature to *chain* calls, though the result int will quickly - ## overflow. Example: - ## - ## .. code-block:: nim - ## var value = 0 - ## discard parseHex("0x38", value) - ## assert value == 56 - ## discard parseHex("0x34", value) - ## assert value == 56 * 256 + 52 - ## value = -1 - ## discard parseHex("0x38", value) - ## assert value == -200 + ## overflow. ## ## If ``maxLen == 0`` the length of the hexadecimal number has no upper bound. ## Else no more than ``start + maxLen`` characters are parsed, up to the ## length of the string. + runnableExamples: + var value = 0 + discard parseHex("0x38", value) + assert value == 56 + discard parseHex("0x34", value) + assert value == 56 * 256 + 52 + value = -1 + discard parseHex("0x38", value) + assert value == -200 var i = start var foundDigit = false # get last index based on minimum `start + maxLen` or `s.len` @@ -80,6 +116,11 @@ proc parseOct*(s: string, number: var int, start = 0, maxLen = 0): int {. ## If ``maxLen == 0`` the length of the octal number has no upper bound. ## Else no more than ``start + maxLen`` characters are parsed, up to the ## length of the string. + runnableExamples: + var res: int + doAssert parseOct("12", res) == 2 + doAssert res == 10 + doAssert parseOct("9", res) == 0 var i = start var foundDigit = false # get last index based on minimum `start + maxLen` or `s.len` @@ -95,7 +136,7 @@ proc parseOct*(s: string, number: var int, start = 0, maxLen = 0): int {. inc(i) if foundDigit: result = i-start -proc parseBin*(s: string, number: var int, start = 0, maxLen = 0): int {. +proc parseBin*(s: string, number: var int, start = 0, maxLen = 0): int {. rtl, extern: "npuParseBin", noSideEffect.} = ## Parses an binary number and stores its value in ``number``. Returns ## the number of the parsed characters or 0 in case of an error. @@ -103,6 +144,10 @@ proc parseBin*(s: string, number: var int, start = 0, maxLen = 0): int {. ## If ``maxLen == 0`` the length of the binary number has no upper bound. ## Else no more than ``start + maxLen`` characters are parsed, up to the ## length of the string. + runnableExamples: + var res: int + doAssert parseBin("010011100110100101101101", res) == 24 + doAssert parseBin("3", res) == 0 var i = start var foundDigit = false # get last index based on minimum `start + maxLen` or `s.len` @@ -119,8 +164,16 @@ proc parseBin*(s: string, number: var int, start = 0, maxLen = 0): int {. if foundDigit: result = i-start proc parseIdent*(s: string, ident: var string, start = 0): int = - ## parses an identifier and stores it in ``ident``. Returns + ## Parses an identifier and stores it in ``ident``. Returns ## the number of the parsed characters or 0 in case of an error. + runnableExamples: + var res: string + doAssert parseIdent("Hello World", res, 0) == 5 + doAssert res == "Hello" + doAssert parseIdent("Hello World", res, 1) == 4 + doAssert res == "ello" + doAssert parseIdent("Hello World", res, 6) == 5 + doAssert res == "World" var i = start if i < s.len and s[i] in IdentStartChars: inc(i) @@ -129,8 +182,13 @@ proc parseIdent*(s: string, ident: var string, start = 0): int = result = i-start proc parseIdent*(s: string, start = 0): string = - ## parses an identifier and returns it or an empty string in + ## Parses an identifier and returns it or an empty string in ## case of an error. + runnableExamples: + doAssert parseIdent("Hello World", 0) == "Hello" + doAssert parseIdent("Hello World", 1) == "ello" + doAssert parseIdent("Hello World", 5) == "" + doAssert parseIdent("Hello World", 6) == "World" result = "" var i = start if i < s.len and s[i] in IdentStartChars: @@ -138,33 +196,35 @@ proc parseIdent*(s: string, start = 0): string = while i < s.len and s[i] in IdentChars: inc(i) result = substr(s, start, i-1) -proc parseToken*(s: string, token: var string, validChars: set[char], - start = 0): int {.inline, deprecated.} = - ## parses a token and stores it in ``token``. Returns - ## the number of the parsed characters or 0 in case of an error. A token - ## consists of the characters in `validChars`. - ## - ## **Deprecated since version 0.8.12**: Use ``parseWhile`` instead. - var i = start - while i < s.len and s[i] in validChars: inc(i) - result = i-start - token = substr(s, start, i-1) - proc skipWhitespace*(s: string, start = 0): int {.inline.} = - ## skips the whitespace starting at ``s[start]``. Returns the number of + ## Skips the whitespace starting at ``s[start]``. Returns the number of ## skipped characters. + runnableExamples: + doAssert skipWhitespace("Hello World", 0) == 0 + doAssert skipWhitespace(" Hello World", 0) == 1 + doAssert skipWhitespace("Hello World", 5) == 1 + doAssert skipWhitespace("Hello World", 5) == 2 while start+result < s.len and s[start+result] in Whitespace: inc(result) proc skip*(s, token: string, start = 0): int {.inline.} = - ## skips the `token` starting at ``s[start]``. Returns the length of `token` + ## Skips the `token` starting at ``s[start]``. Returns the length of `token` ## or 0 if there was no `token` at ``s[start]``. + runnableExamples: + doAssert skip("2019-01-22", "2019", 0) == 4 + doAssert skip("2019-01-22", "19", 0) == 0 + doAssert skip("2019-01-22", "19", 2) == 2 + doAssert skip("CAPlow", "CAP", 0) == 3 + doAssert skip("CAPlow", "cap", 0) == 0 while start+result < s.len and result < token.len and s[result+start] == token[result]: inc(result) if result != token.len: result = 0 proc skipIgnoreCase*(s, token: string, start = 0): int = - ## same as `skip` but case is ignored for token matching. + ## Same as `skip` but case is ignored for token matching. + runnableExamples: + doAssert skipIgnoreCase("CAPlow", "CAP", 0) == 3 + doAssert skipIgnoreCase("CAPlow", "cap", 0) == 3 while start+result < s.len and result < token.len and toLower(s[result+start]) == toLower(token[result]): inc(result) if result != token.len: result = 0 @@ -173,24 +233,45 @@ proc skipUntil*(s: string, until: set[char], start = 0): int {.inline.} = ## Skips all characters until one char from the set `until` is found ## or the end is reached. ## Returns number of characters skipped. + runnableExamples: + doAssert skipUntil("Hello World", {'W', 'e'}, 0) == 1 + doAssert skipUntil("Hello World", {'W'}, 0) == 6 + doAssert skipUntil("Hello World", {'W', 'd'}, 0) == 6 while start+result < s.len and s[result+start] notin until: inc(result) proc skipUntil*(s: string, until: char, start = 0): int {.inline.} = ## Skips all characters until the char `until` is found ## or the end is reached. ## Returns number of characters skipped. + runnableExamples: + doAssert skipUntil("Hello World", 'o', 0) == 4 + doAssert skipUntil("Hello World", 'o', 4) == 0 + doAssert skipUntil("Hello World", 'W', 0) == 6 + doAssert skipUntil("Hello World", 'w', 0) == 11 while start+result < s.len and s[result+start] != until: inc(result) proc skipWhile*(s: string, toSkip: set[char], start = 0): int {.inline.} = ## Skips all characters while one char from the set `token` is found. ## Returns number of characters skipped. + runnableExamples: + doAssert skipWhile("Hello World", {'H', 'e'}) == 2 + doAssert skipWhile("Hello World", {'e'}) == 0 + doAssert skipWhile("Hello World", {'W', 'o', 'r'}, 6) == 3 while start+result < s.len and s[result+start] in toSkip: inc(result) proc parseUntil*(s: string, token: var string, until: set[char], start = 0): int {.inline.} = - ## parses a token and stores it in ``token``. Returns + ## Parses a token and stores it in ``token``. Returns ## the number of the parsed characters or 0 in case of an error. A token ## consists of the characters notin `until`. + runnableExamples: + var myToken: string + doAssert parseUntil("Hello World", myToken, {'W', 'o', 'r'}) == 4 + doAssert myToken == "Hell" + doAssert parseUntil("Hello World", myToken, {'W', 'r'}) == 6 + doAssert myToken == "Hello " + doAssert parseUntil("Hello World", myToken, {'W', 'r'}, 3) == 3 + doAssert myToken == "lo " var i = start while i < s.len and s[i] notin until: inc(i) result = i-start @@ -198,9 +279,17 @@ proc parseUntil*(s: string, token: var string, until: set[char], proc parseUntil*(s: string, token: var string, until: char, start = 0): int {.inline.} = - ## parses a token and stores it in ``token``. Returns + ## Parses a token and stores it in ``token``. Returns ## the number of the parsed characters or 0 in case of an error. A token ## consists of any character that is not the `until` character. + runnableExamples: + var myToken: string + doAssert parseUntil("Hello World", myToken, 'W') == 6 + doAssert myToken == "Hello " + doAssert parseUntil("Hello World", myToken, 'o') == 4 + doAssert myToken == "Hell" + doAssert parseUntil("Hello World", myToken, 'o', 2) == 2 + doAssert myToken == "ll" var i = start while i < s.len and s[i] != until: inc(i) result = i-start @@ -208,9 +297,15 @@ proc parseUntil*(s: string, token: var string, until: char, proc parseUntil*(s: string, token: var string, until: string, start = 0): int {.inline.} = - ## parses a token and stores it in ``token``. Returns + ## Parses a token and stores it in ``token``. Returns ## the number of the parsed characters or 0 in case of an error. A token ## consists of any character that comes before the `until` token. + runnableExamples: + var myToken: string + doAssert parseUntil("Hello World", myToken, "Wor") == 6 + doAssert myToken == "Hello " + doAssert parseUntil("Hello World", myToken, "Wor", 2) == 4 + doAssert myToken == "llo " if until.len == 0: token.setLen(0) return 0 @@ -227,9 +322,15 @@ proc parseUntil*(s: string, token: var string, until: string, proc parseWhile*(s: string, token: var string, validChars: set[char], start = 0): int {.inline.} = - ## parses a token and stores it in ``token``. Returns + ## Parses a token and stores it in ``token``. Returns ## the number of the parsed characters or 0 in case of an error. A token ## consists of the characters in `validChars`. + runnableExamples: + var myToken: string + doAssert parseWhile("Hello World", myToken, {'W', 'o', 'r'}, 0) == 0 + doAssert myToken.len() == 0 + doAssert parseWhile("Hello World", myToken, {'W', 'o', 'r'}, 6) == 3 + doAssert myToken == "Wor" var i = start while i < s.len and s[i] in validChars: inc(i) result = i-start @@ -238,12 +339,21 @@ proc parseWhile*(s: string, token: var string, validChars: set[char], proc captureBetween*(s: string, first: char, second = '\0', start = 0): string = ## Finds the first occurrence of ``first``, then returns everything from there ## up to ``second`` (if ``second`` is '\0', then ``first`` is used). + runnableExamples: + doAssert captureBetween("Hello World", 'e') == "llo World" + doAssert captureBetween("Hello World", 'e', 'r') == "llo Wo" + doAssert captureBetween("Hello World", 'l', start = 6) == "d" var i = skipUntil(s, first, start)+1+start result = "" discard s.parseUntil(result, if second == '\0': first else: second, i) -{.push overflowChecks: on.} -# this must be compiled with overflow checking turned on: +proc integerOutOfRangeError() {.noinline.} = + raise newException(ValueError, "Parsed integer outside of valid range") + +# See #6752 +when defined(js): + {.push overflowChecks: off.} + proc rawParseInt(s: string, b: var BiggestInt, start = 0): int = var sign: BiggestInt = -1 @@ -256,48 +366,67 @@ proc rawParseInt(s: string, b: var BiggestInt, start = 0): int = if i < s.len and s[i] in {'0'..'9'}: b = 0 while i < s.len and s[i] in {'0'..'9'}: - b = b * 10 - (ord(s[i]) - ord('0')) + let c = ord(s[i]) - ord('0') + if b >= (low(BiggestInt) + c) div 10: + b = b * 10 - c + else: + integerOutOfRangeError() inc(i) while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored - b = b * sign - result = i - start -{.pop.} # overflowChecks + if sign == -1 and b == low(BiggestInt): + integerOutOfRangeError() + else: + b = b * sign + result = i - start + +when defined(js): + {.pop.} # overflowChecks: off proc parseBiggestInt*(s: string, number: var BiggestInt, start = 0): int {. - rtl, extern: "npuParseBiggestInt", noSideEffect.} = - ## parses an integer starting at `start` and stores the value into `number`. + rtl, extern: "npuParseBiggestInt", noSideEffect, raises: [ValueError].} = + ## Parses an integer starting at `start` and stores the value into `number`. ## Result is the number of processed chars or 0 if there is no integer. - ## `OverflowError` is raised if an overflow occurs. + ## `ValueError` is raised if the parsed integer is out of the valid range. + runnableExamples: + var res: BiggestInt + doAssert parseBiggestInt("9223372036854775807", res, 0) == 19 + doAssert res == 9223372036854775807 var res: BiggestInt # use 'res' for exception safety (don't write to 'number' in case of an # overflow exception): result = rawParseInt(s, res, start) - number = res + if result != 0: + number = res proc parseInt*(s: string, number: var int, start = 0): int {. - rtl, extern: "npuParseInt", noSideEffect.} = - ## parses an integer starting at `start` and stores the value into `number`. + rtl, extern: "npuParseInt", noSideEffect, raises: [ValueError].} = + ## Parses an integer starting at `start` and stores the value into `number`. ## Result is the number of processed chars or 0 if there is no integer. - ## `OverflowError` is raised if an overflow occurs. + ## `ValueError` is raised if the parsed integer is out of the valid range. + runnableExamples: + var res: int + doAssert parseInt("2019", res, 0) == 4 + doAssert res == 2019 + doAssert parseInt("2019", res, 2) == 2 + doAssert res == 19 var res: BiggestInt result = parseBiggestInt(s, res, start) - if (sizeof(int) <= 4) and - ((res < low(int)) or (res > high(int))): - raise newException(OverflowError, "overflow") - elif result != 0: + when sizeof(int) <= 4: + if res < low(int) or res > high(int): + integerOutOfRangeError() + if result != 0: number = int(res) -proc parseSaturatedNatural*(s: string, b: var int, start = 0): int = - ## parses a natural number into ``b``. This cannot raise an overflow +proc parseSaturatedNatural*(s: string, b: var int, start = 0): int {. + raises: [].}= + ## Parses a natural number into ``b``. This cannot raise an overflow ## error. ``high(int)`` is returned for an overflow. ## The number of processed character is returned. ## This is usually what you really want to use instead of `parseInt`:idx:. - ## Example: - ## - ## .. code-block:: nim - ## var res = 0 - ## discard parseSaturatedNatural("848", res) - ## doAssert res == 848 + runnableExamples: + var res = 0 + discard parseSaturatedNatural("848", res) + doAssert res == 848 var i = start if i < s.len and s[i] == '+': inc(i) if i < s.len and s[i] in {'0'..'9'}: @@ -312,12 +441,13 @@ proc parseSaturatedNatural*(s: string, b: var int, start = 0): int = while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored result = i - start -# overflowChecks doesn't work with BiggestUInt proc rawParseUInt(s: string, b: var BiggestUInt, start = 0): int = var res = 0.BiggestUInt prev = 0.BiggestUInt i = start + if i < s.len - 1 and s[i] == '-' and s[i + 1] in {'0'..'9'}: + integerOutOfRangeError() if i < s.len and s[i] == '+': inc(i) # Allow if i < s.len and s[i] in {'0'..'9'}: b = 0 @@ -325,56 +455,75 @@ proc rawParseUInt(s: string, b: var BiggestUInt, start = 0): int = prev = res res = res * 10 + (ord(s[i]) - ord('0')).BiggestUInt if prev > res: - return 0 # overflowChecks emulation + integerOutOfRangeError() inc(i) while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored b = res result = i - start proc parseBiggestUInt*(s: string, number: var BiggestUInt, start = 0): int {. - rtl, extern: "npuParseBiggestUInt", noSideEffect.} = - ## parses an unsigned integer starting at `start` and stores the value + rtl, extern: "npuParseBiggestUInt", noSideEffect, raises: [ValueError].} = + ## Parses an unsigned integer starting at `start` and stores the value ## into `number`. - ## Result is the number of processed chars or 0 if there is no integer - ## or overflow detected. + ## `ValueError` is raised if the parsed integer is out of the valid range. + runnableExamples: + var res: BiggestUInt + doAssert parseBiggestUInt("12", res, 0) == 2 + doAssert res == 12 + doAssert parseBiggestUInt("1111111111111111111", res, 0) == 19 + doAssert res == 1111111111111111111'u64 var res: BiggestUInt # use 'res' for exception safety (don't write to 'number' in case of an # overflow exception): result = rawParseUInt(s, res, start) - number = res + if result != 0: + number = res proc parseUInt*(s: string, number: var uint, start = 0): int {. - rtl, extern: "npuParseUInt", noSideEffect.} = - ## parses an unsigned integer starting at `start` and stores the value + rtl, extern: "npuParseUInt", noSideEffect, raises: [ValueError].} = + ## Parses an unsigned integer starting at `start` and stores the value ## into `number`. - ## Result is the number of processed chars or 0 if there is no integer or - ## overflow detected. + ## `ValueError` is raised if the parsed integer is out of the valid range. + runnableExamples: + var res: uint + doAssert parseUInt("3450", res) == 4 + doAssert res == 3450 + doAssert parseUInt("3450", res, 2) == 2 + doAssert res == 50 var res: BiggestUInt result = parseBiggestUInt(s, res, start) when sizeof(BiggestUInt) > sizeof(uint) and sizeof(uint) <= 4: if res > 0xFFFF_FFFF'u64: - raise newException(OverflowError, "overflow") + integerOutOfRangeError() if result != 0: number = uint(res) proc parseBiggestFloat*(s: string, number: var BiggestFloat, start = 0): int {. magic: "ParseBiggestFloat", importc: "nimParseBiggestFloat", noSideEffect.} - ## parses a float starting at `start` and stores the value into `number`. + ## Parses a float starting at `start` and stores the value into `number`. ## Result is the number of processed chars or 0 if a parsing error ## occurred. proc parseFloat*(s: string, number: var float, start = 0): int {. rtl, extern: "npuParseFloat", noSideEffect.} = - ## parses a float starting at `start` and stores the value into `number`. + ## Parses a float starting at `start` and stores the value into `number`. ## Result is the number of processed chars or 0 if there occurred a parsing ## error. + runnableExamples: + var res: float + doAssert parseFloat("32", res, 0) == 2 + doAssert res == 32.0 + doAssert parseFloat("32.57", res, 0) == 5 + doAssert res == 32.57 + doAssert parseFloat("32.57", res, 3) == 2 + doAssert res == 57.00 var bf: BiggestFloat result = parseBiggestFloat(s, bf, start) if result != 0: number = bf type - InterpolatedKind* = enum ## describes for `interpolatedFragments` + InterpolatedKind* = enum ## Describes for `interpolatedFragments` ## which part of the interpolated string is ## yielded; for example in "str$$$var${expr}" ikStr, ## ``str`` part of the interpolated string @@ -490,4 +639,8 @@ when isMainModule: doAssert(parseSaturatedNatural("1_000_000", value) == 9) doAssert value == 1_000_000 + var i64Value: int64 + discard parseBiggestInt("9223372036854775807", i64Value) + doAssert i64Value == 9223372036854775807 + {.pop.} |