about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2024-02-22 19:12:48 +0100
committer bptato <nincsnevem662@gmail.com> 2024-02-22 19:12:48 +0100
commit 235cf0b2beafa177e7fa74ad6cc099ffbe5ec65f (patch)
tree 432aa0ded0dbb5d77d24c48429b54cb3a00cb841
parent b3fa4b34b3c605ca9a6564493139e5ef1949dc73 (diff)
download chawan-235cf0b2beafa177e7fa74ad6cc099ffbe5ec65f.tar.gz
Make validator actually work
The API made little sense; `n' was being set almost randomly to "some
byte at the current position."
-rw-r--r-- chagashi/validator.nim 21
-rw-r--r-- chagashi/validatorcore.nim 13
-rw-r--r-- test/basic.nim 6
3 files changed, 24 insertions, 16 deletions
diff --git a/chagashi/validator.nim b/chagashi/validator.nim
index a66d2f9e..c121eeef 100644
--- a/chagashi/validator.nim
+++ b/chagashi/validator.nim
@@ -6,15 +6,17 @@ proc validateUTF8Surr*(s: string, start = 0): int =
   ## Analogous to std/unicode's validateUtf8, but also reports surrogates and
   ## has an optional `start` parameter.
   var tv = TextValidatorUTF8()
+  # The initial value of `n' must be -1. (Though `validate' sets it to -1 too,
+  # so this is not really needed.)
+  var n = -1
   while true:
-    var n: int
     case tv.validate(s.toOpenArrayByte(0, s.high), n)
     of tvrDone:
       if tv.finish() == tvrError:
-        return n
+        return n + 1
       break
     of tvrError:
-      return tv.i
+      return n + 1
   return -1
 
 proc toValidUTF8*(s: string): string =
@@ -22,20 +24,21 @@ proc toValidUTF8*(s: string): string =
   var buf = ""
   var tv = TextValidatorUTF8()
   var pi = 0
+  # see above
+  var n = -1
   while true:
-    var n: int
     case tv.validate(s.toOpenArrayByte(0, s.high), n)
     of tvrDone:
-      let fr = tv.finish()
-      if fr == tvrError or buf.len > 0:
-        buf &= s.substr(pi, n - 1)
-      if fr == tvrError:
+      let r = tv.finish()
+      if r == tvrError or buf.len > 0:
+        buf &= s.substr(pi, n)
+      if r == tvrError:
         buf &= "\uFFFD"
       if buf.len > 0:
         return buf
       break
     of tvrError:
-      buf &= s.substr(pi, n - 1)
+      buf &= s.substr(pi, n)
       buf &= "\uFFFD"
       pi = tv.i
   return s # buf was empty; s is valid.
diff --git a/chagashi/validatorcore.nim b/chagashi/validatorcore.nim
index 8965bd9b..85325e7f 100644
--- a/chagashi/validatorcore.nim
+++ b/chagashi/validatorcore.nim
@@ -19,8 +19,15 @@ proc validate*(tv: var TextValidatorUTF8, iq: openArray[uint8], n: var int):
     TextValidatorResult =
   ## Validate the UTF-8 encoded input queue `iq`.
   ##
-  ## On success, tvrDone is returned, `n` is set to the length of `iq` (i.e. the
-  ## whole buffer is consumed), and `tv.i` is set to 0.
+  ## On success, tvrDone is returned, and `n` is set to the last valid consumed
+  ## index of `iq`. BEWARE: this may be lower than the highest index of `iq`;
+  ## for example, if the first byte is valid, `n` is set to -1.
+  ##
+  ## If `n` is less than `iq.high`, the following steps must be taken:
+  ## * If no more bytes exist in the queue, output an error.
+  ## * Store the bytes `n..iq.high` in a temporary buffer
+  ## * If the next call to `validate` returns tvrDone, output these
+  ##   bytes. Otherwise, discard the bytes and output U+FFFD as usual.
   ##
   ## On failure, tvrError is returned. In this case, `n` signifies the last
   ## valid input byte, while `tv.i` signifies the next byte to be consumed in
@@ -34,6 +41,7 @@ proc validate*(tv: var TextValidatorUTF8, iq: openArray[uint8], n: var int):
   ## 3. Output all bytes between the previously saved `tv.i` value and `n - 1`
   ## 4. Output a U+FFFD replacement character
   ## 5. Go to 1 (call with the same `iq` until no `tvrError` is returned).
+  n = -1
   if tv.bounds.a == 0: # unset
     tv.bounds = 0x80u8 .. 0xBFu8
   while (let i = tv.i; i < iq.len):
@@ -69,7 +77,6 @@ proc validate*(tv: var TextValidatorUTF8, iq: openArray[uint8], n: var int):
         n = tv.i
       tv.bounds = 0x80u8 .. 0xBFu8
     inc tv.i
-  n = tv.i
   tv.i = 0
   tvrDone
 
diff --git a/test/basic.nim b/test/basic.nim
index 99ca5c56..fb75eba5 100644
--- a/test/basic.nim
+++ b/test/basic.nim
@@ -26,13 +26,11 @@ test "validate UTF-8 in parts":
   var r = uv.validate(ss0.toOpenArrayByte(0, ss0.high), n)
   # read Hell (0xC3 is not consumed yet)
   check r == tvrDone
-  check n == ss0.len
+  check n == 3
   let ss1 = "\xB6, world!\xC3"
   # read 0xB6 + , world! => Hellö world!
   check uv.validate(ss1.toOpenArrayByte(0, ss1.high), n) == tvrDone
-  check n == ss1.len
-  # finish
-  check uv.finish() == tvrError
+  check n == ss1.high - 1
 
 test "validate valid UTF-8":
   const utf8_valid = [