summary refs log tree commit diff stats
path: root/lib/pure/unicode.nim
diff options
context:
space:
mode:
authorSimon Hafner <hafnersimon@gmail.com>2015-05-08 06:40:34 +0500
committerSimon Hafner <hafnersimon@gmail.com>2015-05-08 06:40:34 +0500
commitf5cca89610905f35b50259cfe81e6d1d4153d39c (patch)
tree3c49f45590875e4eab78ae976d6ac254030e62af /lib/pure/unicode.nim
parent2474c1bb111b38ddef64659c893722b357a27384 (diff)
parentc384f05e49e0716cc99042491f65bcc7d415d4c3 (diff)
downloadNim-f5cca89610905f35b50259cfe81e6d1d4153d39c.tar.gz
merged devel into epc
Diffstat (limited to 'lib/pure/unicode.nim')
-rw-r--r--lib/pure/unicode.nim25
1 files changed, 25 insertions, 0 deletions
diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index 4a9f4631d..5fd3c2418 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -105,6 +105,31 @@ template fastRuneAt*(s: string, i: int, result: expr, doInc = true) =
     result = Rune(ord(s[i]))
     when doInc: inc(i)
 
+proc validateUtf8*(s: string): int =
+  ## returns the position of the invalid byte in ``s`` if the string ``s`` does
+  ## not hold valid UTF-8 data. Otherwise -1 is returned.
+  var i = 0
+  let L = s.len
+  while i < L:
+    if ord(s[i]) <=% 127:
+      inc(i)
+    elif ord(s[i]) shr 5 == 0b110:
+      if i+1 < L and ord(s[i+1]) shr 6 == 0b10: inc(i, 2)
+      else: return i
+    elif ord(s[i]) shr 4 == 0b1110:
+      if i+2 < L and ord(s[i+1]) shr 6 == 0b10 and ord(s[i+2]) shr 6 == 0b10:
+        inc i, 3
+      else: return i
+    elif ord(s[i]) shr 3 == 0b11110:
+      if i+3 < L and ord(s[i+1]) shr 6 == 0b10 and
+                     ord(s[i+2]) shr 6 == 0b10 and
+                     ord(s[i+3]) shr 6 == 0b10:
+        inc i, 4
+      else: return i
+    else:
+      return i
+  return -1
+
 proc runeAt*(s: string, i: Natural): Rune =
   ## returns the unicode character in `s` at byte index `i`
   fastRuneAt(s, i, result, false)