diff options
author | bptato <nincsnevem662@gmail.com> | 2024-02-22 19:12:48 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-02-22 19:12:48 +0100 |
commit | 235cf0b2beafa177e7fa74ad6cc099ffbe5ec65f (patch) | |
tree | 432aa0ded0dbb5d77d24c48429b54cb3a00cb841 | |
parent | b3fa4b34b3c605ca9a6564493139e5ef1949dc73 (diff) | |
download | chawan-235cf0b2beafa177e7fa74ad6cc099ffbe5ec65f.tar.gz |
Make validator actually work
The API made little sense; `n' was being set almost randomly to "some byte at the current position."
-rw-r--r-- | chagashi/validator.nim | 21 |
-rw-r--r-- | chagashi/validatorcore.nim | 13 |
-rw-r--r-- | test/basic.nim | 6 |
3 files changed, 24 insertions, 16 deletions
diff --git a/chagashi/validator.nim b/chagashi/validator.nim index a66d2f9e..c121eeef 100644 --- a/chagashi/validator.nim +++ b/chagashi/validator.nim @@ -6,15 +6,17 @@ proc validateUTF8Surr*(s: string, start = 0): int = ## Analogous to std/unicode's validateUtf8, but also reports surrogates and ## has an optional `start` parameter. var tv = TextValidatorUTF8() + # The initial value of `n' must be -1. (Though `validate' sets it to -1 too, + # so this is not really needed.) + var n = -1 while true: - var n: int case tv.validate(s.toOpenArrayByte(0, s.high), n) of tvrDone: if tv.finish() == tvrError: - return n + return n + 1 break of tvrError: - return tv.i + return n + 1 return -1 proc toValidUTF8*(s: string): string = @@ -22,20 +24,21 @@ proc toValidUTF8*(s: string): string = var buf = "" var tv = TextValidatorUTF8() var pi = 0 + # see above + var n = -1 while true: - var n: int case tv.validate(s.toOpenArrayByte(0, s.high), n) of tvrDone: - let fr = tv.finish() - if fr == tvrError or buf.len > 0: - buf &= s.substr(pi, n - 1) - if fr == tvrError: + let r = tv.finish() + if r == tvrError or buf.len > 0: + buf &= s.substr(pi, n) + if r == tvrError: buf &= "\uFFFD" if buf.len > 0: return buf break of tvrError: - buf &= s.substr(pi, n - 1) + buf &= s.substr(pi, n) buf &= "\uFFFD" pi = tv.i return s # buf was empty; s is valid. diff --git a/chagashi/validatorcore.nim b/chagashi/validatorcore.nim index 8965bd9b..85325e7f 100644 --- a/chagashi/validatorcore.nim +++ b/chagashi/validatorcore.nim @@ -19,8 +19,15 @@ proc validate*(tv: var TextValidatorUTF8, iq: openArray[uint8], n: var int): TextValidatorResult = ## Validate the UTF-8 encoded input queue `iq`. ## - ## On success, tvrDone is returned, `n` is set to the length of `iq` (i.e. the - ## whole buffer is consumed), and `tv.i` is set to 0. + ## On success, tvrDone is returned, and `n` is set to the last valid consumed + ## index of `iq`. 
BEWARE: this may be lower than the highest index of `iq`; + ## for example, if the first byte is valid, `n` is set to -1. + ## + ## If `n` is less than `iq.high`, the following steps must be taken: + ## * If no more bytes exist in the queue, output an error. + ## * Store the bytes `n..iq.high` in a temporary buffer + ## * If the next call to `validate` returns tvrDone, output these + ## bytes. Otherwise, discard the bytes and output U+FFFD as usual. ## ## On failure, tvrError is returned. In this case, `n` signifies the last ## valid input byte, while `tv.i` signifies the next byte to be consumed in @@ -34,6 +41,7 @@ proc validate*(tv: var TextValidatorUTF8, iq: openArray[uint8], n: var int): ## 3. Output all bytes between the previously saved `tv.i` value and `n - 1` ## 4. Output a U+FFFD replacement character ## 5. Go to 1 (call with the same `iq` until no `tvrError` is returned). + n = -1 if tv.bounds.a == 0: # unset tv.bounds = 0x80u8 .. 0xBFu8 while (let i = tv.i; i < iq.len): @@ -69,7 +77,6 @@ proc validate*(tv: var TextValidatorUTF8, iq: openArray[uint8], n: var int): n = tv.i tv.bounds = 0x80u8 .. 0xBFu8 inc tv.i - n = tv.i tv.i = 0 tvrDone diff --git a/test/basic.nim b/test/basic.nim index 99ca5c56..fb75eba5 100644 --- a/test/basic.nim +++ b/test/basic.nim @@ -26,13 +26,11 @@ test "validate UTF-8 in parts": var r = uv.validate(ss0.toOpenArrayByte(0, ss0.high), n) # read Hell (0xC3 is not consumed yet) check r == tvrDone - check n == ss0.len + check n == 3 let ss1 = "\xB6, world!\xC3" # read 0xB6 + , world! => Hellö world! check uv.validate(ss1.toOpenArrayByte(0, ss1.high), n) == tvrDone - check n == ss1.len - # finish - check uv.finish() == tvrError + check n == ss1.high - 1 test "validate valid UTF-8": const utf8_valid = [ |