summary refs log tree commit diff stats
path: root/lib/pure/unicode.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/unicode.nim')
-rw-r--r-- lib/pure/unicode.nim 25
1 files changed, 25 insertions, 0 deletions
diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index 4a9f4631d..5fd3c2418 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -105,6 +105,31 @@ template fastRuneAt*(s: string, i: int, result: expr, doInc = true) =
     result = Rune(ord(s[i]))
     when doInc: inc(i)
 
+proc validateUtf8*(s: string): int =
+  ## Returns -1 if ``s`` passes the UTF-8 structure check, else the index of
+  ## the lead byte of the first malformed sequence or of the first stray byte.
+  var i = 0
+  let L = s.len
+  while i < L: # bit patterns only: overlong forms/surrogates are not rejected
+    if ord(s[i]) <=% 127: # 0xxxxxxx: single-byte (ASCII) sequence
+      inc(i)
+    elif ord(s[i]) shr 5 == 0b110: # 110xxxxx: lead byte of a 2-byte sequence
+      if i+1 < L and ord(s[i+1]) shr 6 == 0b10: inc(i, 2)
+      else: return i # truncated, or next byte is not a 10xxxxxx continuation
+    elif ord(s[i]) shr 4 == 0b1110: # 1110xxxx: lead byte of a 3-byte sequence
+      if i+2 < L and ord(s[i+1]) shr 6 == 0b10 and ord(s[i+2]) shr 6 == 0b10:
+        inc i, 3
+      else: return i
+    elif ord(s[i]) shr 3 == 0b11110: # 11110xxx: lead byte of a 4-byte sequence
+      if i+3 < L and ord(s[i+1]) shr 6 == 0b10 and
+                     ord(s[i+2]) shr 6 == 0b10 and
+                     ord(s[i+3]) shr 6 == 0b10:
+        inc i, 4
+      else: return i
+    else: # stray continuation byte (10xxxxxx) or a byte of the form 11111xxx
+      return i
+  return -1
+
 proc runeAt*(s: string, i: Natural): Rune =
   ## returns the unicode character in `s` at byte index `i`
   fastRuneAt(s, i, result, false)