This adds `parseutils.parseSize`, an inverse to `strutils.formatSize` (#21349)

* This adds `parseutils.parseSize`, an inverse to `strutils.formatSize` which has existed since 2017. It is useful for parsing the compiler's own output logs (like SuccessX) or many other scenarios where "human readable" units have been chosen. The doc comment and tests explain accepted syntax in detail. Big units lead to small numbers, often with a fractional part, but we parse into an `int64` since that is what `formatSize` stringifies and this is an inverse over partial function slots. Although metric prefixes z & y for zettabyte & yottabyte are accepted, these will saturate the result at `int64.high` unless the qualified number is a small fraction. This should not be much of a problem until such sizes are common (at which point another overload with the parse result either `float64` or `int128` could be added). Tests avoids `test()` because of a weakly related static: test() failure as mentioned in https://github.com/nim-lang/Nim/pull/21325. This is a more elemental VM failure. As such, it needs its own failure exhibition issue that is a smaller test case. (I am working on that, but unless there is a burning need to `parseSize` at compile-time before run-time it need not hold up this PR.) * This worked with `int` but fails with `int64`. Try for green tests. * Lift 2-result matching into a `checkParseSize` template and format as a table of input & 2 expected outputs which seems nicer and to address https://github.com/nim-lang/Nim/pull/21349#pullrequestreview-1294407679 * Fix (probably) the i386 trouble by using `int64` consistently. * Improve documentation by mentioning saturation. * Improve documentation with `runnableExamples` and a little more detail in the main doc comment based on excellent code review by @juancarlospaco: https://github.com/nim-lang/Nim/pull/21349#pullrequestreview-1294564155 * Address some more @juancarlospaco code review concerns. * Remove a stray space. * Mention milli-bytes in docs to maybe help clarify why wild conventions are so prone to going case-insensitive-metric. * Add some parens.
author: c-blake <c-blake@users.noreply.github.com> 2023-02-14 02:00:30 -0500
committer: GitHub <noreply@github.com> 2023-02-14 08:00:30 +0100
commit: 1d06c2b6cf84bcf65b67e88667dba4fd5e8ba519 (patch)
tree: 8c65f7f9cc880db4856d5030e1c13e34161a9a89
parent: 406d3021314c66fdf4c715efeeaea3c8f478fcaa (diff)
download: Nim-1d06c2b6cf84bcf65b67e88667dba4fd5e8ba519.tar.gz
3 files changed, 107 insertions, 1 deletions
diff --git a/changelog.md b/changelog.md
index 0389faad9..2db577d0b 100644
--- a/changelog.md
+++ b/changelog.md
@@ -10,7 +10,7 @@
 
 
 [//]: # "Additions:"
-
+- Added `parseutils.parseSize` - inverse to `strutils.formatSize` - to parse human readable sizes.
 
 [//]: # "Deprecations:"
 
diff --git a/lib/pure/parseutils.nim b/lib/pure/parseutils.nim
index 2bb23c626..c4d59d892 100644
--- a/lib/pure/parseutils.nim
+++ b/lib/pure/parseutils.nim
@@ -592,6 +592,71 @@ proc parseFloat*(s: openArray[char], number: var float): int {.
   if result != 0:
     number = bf
 
+func toLowerAscii(c: char): char =
+  if c in {'A'..'Z'}: char(uint8(c) xor 0b0010_0000'u8) else: c
+
+func parseSize*(s: openArray[char], size: var int64, alwaysBin=false): int =
+  ## Parse a size qualified by binary or metric units into `size`.  This format
+  ## is often called "human readable".  Result is the number of processed chars
+  ## or 0 on parse errors and size is rounded to the nearest integer.  Trailing
+  ## garbage like "/s" in "1k/s" is allowed and detected by `result < s.len`.
+  ##
+  ## To simplify use, following non-rare wild conventions, and since fractional
+  ## data like milli-bytes is so rare, unit matching is case-insensitive but for
+  ## the 'i' distinguishing binary-metric from metric (which cannot be 'I').
+  ##
+  ## An optional trailing 'B|b' is ignored but processed.  I.e., you must still
+  ## know if units are bytes | bits or infer this fact via the case of s[^1] (if
+  ## users can even be relied upon to use 'B' for byte and 'b' for bit or have
+  ## that be s[^1]).
+  ##
+  ## If `alwaysBin==true` then scales are always binary-metric, but e.g. "KiB"
+  ## is still accepted for clarity.  If the value would exceed the range of
+  ## `int64`, `size` saturates to `int64.high`.  Supported metric prefix chars
+  ## include k, m, g, t, p, e, z, y (but z & y saturate unless the number is a
+  ## small fraction).
+  ##
+  ## **See also:**
+  ## * https://en.wikipedia.org/wiki/Binary_prefix
+  ## * `formatSize module<strutils.html>`_ for formatting
+  runnableExamples:
+    var res: int64  # caller must still know if 'b' refers to bytes|bits
+    doAssert parseSize("10.5 MB", res) == 7
+    doAssert res == 10_500_000  # decimal metric Mega prefix
+    doAssert parseSize("64 mib", res) == 6
+    doAssert res == 67108864    # 64 shl 20
+    doAssert parseSize("1G/h", res, true) == 2 # '/' stops parse
+    doAssert res == 1073741824  # 1 shl 30, forced binary metric
+  const prefix = "b" & "kmgtpezy"       # byte|bit & lowCase metric-ish prefixes
+  const scaleM = [1.0, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21, 1e24] # 10^(3*idx)
+  const scaleB = [1.0, 1024, 1048576, 1073741824, 1099511627776.0,  # 2^(10*idx)
+                  1125899906842624.0, 1152921504606846976.0,        # ldexp?
+                  1.180591620717411303424e21, 1.208925819614629174706176e24]
+  var number: float
+  var scale = 1.0
+  result = parseFloat(s, number)
+  if number < 0:                        # While parseFloat accepts negatives ..
+    result = 0                          #.. we do not since sizes cannot be < 0
+  if result > 0:
+    let start = result                  # Save spot to maybe unwind white to EOS
+    while result < s.len and s[result] in Whitespace:
+      inc result
+    if result < s.len:                  # Illegal starting char => unity
+      if (let si = prefix.find(s[result].toLowerAscii); si >= 0):
+        inc result                      # Now parse the scale
+        scale = if alwaysBin: scaleB[si] else: scaleM[si]
+        if result < s.len and s[result] == 'i':
+          scale = scaleB[si]            # Switch from default to binary-metric
+          inc result
+        if result < s.len and s[result].toLowerAscii == 'b':
+          inc result                    # Skip optional '[bB]'
+    else:                               # Unwind result advancement when there..
+      result = start                    #..is no unit to the end of `s`.
+    var sizeF = number * scale + 0.5    # Saturate to int64.high when too big
+    size = if sizeF > 9223372036854774784.0: int64.high else: sizeF.int64
+# Above constant=2^63-1024 avoids C UB; github.com/nim-lang/Nim/issues/20102 or
+# stackoverflow.com/questions/20923556/math-pow2-63-1-math-pow2-63-512-is-true
+
 type
   InterpolatedKind* = enum ## Describes for `interpolatedFragments`
                            ## which part of the interpolated string is
diff --git a/tests/stdlib/tparseutils.nim b/tests/stdlib/tparseutils.nim
index 102e74067..218dd08f6 100644
--- a/tests/stdlib/tparseutils.nim
+++ b/tests/stdlib/tparseutils.nim
@@ -55,5 +55,46 @@ proc test() =
     doAssert res == @[(17, "9.123456789012344"), (18, "11.123456789012344"),
                       (17, "9.123456789012344"), (17, "8.123456789012344"),
                       (16, "9.12345678901234"), (17, "9.123456789012344")]
+
 test()
 static: test()
+
+block:  # With this included, static: test() crashes the compiler (from a
+        # VM problem with parseSize calling parseFloat).
+  var sz: int64
+  template checkParseSize(s, expectLen, expectVal) =
+    if (let got = parseSize(s, sz); got != expectLen):
+      raise newException(IOError, "got len " & $got & " != " & $expectLen)
+    if sz != expectVal:
+      raise newException(IOError, "got sz " & $sz & " != " & $expectVal)
+  #              STRING    LEN SZ
+  # Good, complete parses
+  checkParseSize "1  b"   , 4, 1
+  checkParseSize "1  B"   , 4, 1
+  checkParseSize "1k"     , 2, 1000
+  checkParseSize "1 kib"  , 5, 1024
+  checkParseSize "1 ki"   , 4, 1024
+  checkParseSize "1mi"    , 3, 1048576
+  checkParseSize "1 mi"   , 4, 1048576
+  checkParseSize "1 mib"  , 5, 1048576
+  checkParseSize "1 Mib"  , 5, 1048576
+  checkParseSize "1 MiB"  , 5, 1048576
+  checkParseSize "1.23GiB", 7, 1320702444 # 1320702443.52 rounded
+  checkParseSize "0.001k" , 6, 1
+  checkParseSize "0.0004k", 7, 0
+  checkParseSize "0.0006k", 7, 1
+  # Incomplete parses
+  checkParseSize "1  "    , 1, 1          # Trailing white IGNORED
+  checkParseSize "1  B "  , 4, 1          # Trailing white IGNORED
+  checkParseSize "1  B/s" , 4, 1          # Trailing junk IGNORED
+  checkParseSize "1 kX"   , 3, 1000
+  checkParseSize "1 kiX"  , 4, 1024
+  checkParseSize "1j"     , 1, 1          # Unknown prefix IGNORED
+  checkParseSize "1 jib"  , 2, 1          # Unknown prefix post space
+  checkParseSize "1  ji"  , 3, 1
+  # Bad parses; `sz` should stay last good|incomplete value
+  checkParseSize "-1b"    , 0, 1          # Negative numbers
+  checkParseSize "abc"    , 0, 1          # Non-numeric
+  checkParseSize " 12"    , 0, 1          # Leading white
+  # Value Edge cases
+  checkParseSize "9223372036854775807", 19, int64.high
author	c-blake <c-blake@users.noreply.github.com>	2023-02-14 02:00:30 -0500
committer	GitHub <noreply@github.com>	2023-02-14 08:00:30 +0100
commit	1d06c2b6cf84bcf65b67e88667dba4fd5e8ba519 (patch)
tree	8c65f7f9cc880db4856d5030e1c13e34161a9a89
parent	406d3021314c66fdf4c715efeeaea3c8f478fcaa (diff)
download	Nim-1d06c2b6cf84bcf65b67e88667dba4fd5e8ba519.tar.gz