Replaced parser, readline, etc...

author: bptato <nincsnevem662@gmail.com> 2021-03-12 20:52:44 +0100
committer: bptato <nincsnevem662@gmail.com> 2021-03-12 20:53:18 +0100
commit: 6084e104d5c3868196c9e2d3748c6627cf983470 (patch)
tree: f5103ffd0b33049e74b44f7a91448a22e3c8ef74 /twtstr.nim
parent: 424f52e1f2eff84ee23a79b38e7932c2918351ed (diff)
download: chawan-6084e104d5c3868196c9e2d3748c6627cf983470.tar.gz
1 files changed, 63 insertions, 36 deletions
diff --git a/twtstr.nim b/twtstr.nim
index 63854879..42a64333 100644
--- a/twtstr.nim
+++ b/twtstr.nim
@@ -48,8 +48,18 @@ func remove*(str: string, c: string): string =
     if rem != rune:
       result &= $rune
 
+const ControlChars = {chr(0x00)..chr(0x1F), chr(0x7F)}
+
+const Whitespace = { ' ', '\n', '\r', '\t' }
+
+func isWhitespace*(c: char): bool = # membership test against Whitespace
+  c in Whitespace
+
 func isControlChar*(c: char): bool =
-  return c <= char(0x1F) or c == char(0x7F)
+  return c in ControlChars
+
+func isControlChar*(r: Rune): bool = # only runes that fit in a char qualify
+  r.int <= high(char).int and r.char in ControlChars
 
 func getControlChar*(c: char): char =
   if c >= 'a':
@@ -84,11 +94,40 @@ func findChar*(str: string, c: Rune, start: int = 0): int =
     i = n
   return -1
 
+func getLowerChars*(): string =
+  # 256-entry table: byte i maps to its ASCII-lowercased counterpart
+  result = newString(256)
+  for code in 0..255:
+    let c = chr(code)
+    # 'a' - 'A' == 32; bytes outside 'A'..'Z' map to themselves
+    result[code] = if c in {'A'..'Z'}: chr(code + 32) else: c
+
+const lowerChars = getLowerChars()
+
+func tolower*(c: char): char = # lookup in the precomputed lowerChars table
+  lowerChars[ord(c)]
+
+const breakWord = [
+  Rune('\n'), Rune('/'), Rune('\\'), Rune(' '), Rune('&'), Rune('='),
+  Rune('?'), Rune('.'), Rune(';')
+]
+
+func breaksWord*(r: Rune): bool = # true when r is one of the breakWord runes
+  breakWord.contains(r)
+
+func substr*(s: seq[Rune], i: int, j: int): seq[Rune] =
+  # half-open [i, j) clamped to s; empty/inverted/out-of-range gives @[]
+  let (first, last) = (max(i, 0), min(j, s.len))
+  if first < last: result = s[first..<last]
 
-#Measure length of rune. Transpiled from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+func substr*(s: seq[Rune], i: int): seq[Rune] =
+  # `>` rather than `>=`: i == high(s) must still yield the final rune
+  if i > high(s): return @[]
+  return s[max(i, 0)..high(s)]
+
+#Measure length of rune. From https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 
 #auxiliary function for binary search in interval table
-#TODO: use binary search from stdlib?
 func bisearch(ucs: Rune, table: openarray[(int, int)]): bool =
   var max = table.high
   var min = 0
@@ -107,14 +146,13 @@ func bisearch(ucs: Rune, table: openarray[(int, int)]): bool =
       return true
   return false
 
-
 #The following two functions define the column width of an ISO 10646
 #character as follows:
 #
 #   - The null character (U+0000) has a column width of 0.
 #
 #   - Other C0/C1 control characters and DEL will lead to a return
-#     value of -1.
+#     value of 2 (C0 and DEL only; isControlChar does not match C1).
 #
 #   - Non-spacing and enclosing combining characters (general
 #     category code Mn or Me in the Unicode database) have a
@@ -191,21 +229,18 @@ const combining = [
   ( 0xE0100, 0xE01EF )
 ]
 
-func mk_wcwidth(r: Rune): int =
+func width*(r: Rune): int =
   let ucs = int(r)
   # sorted list of non-overlapping intervals of non-spacing characters
   # generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
 
-  # test for 8-bit control characters
-  if ucs == 0:
-    return 0
-  if ucs < 32 or (ucs >= 0x7f and ucs < 0xa0):
-    return -1
-
   # binary search in table of non-spacing characters
   if bisearch(r, combining):
     return 0
 
+  if r.isControlChar():
+    return 2
+
   # if we arrive here, ucs is not a combining or C0/C1 control character
 
   if (ucs >= 0x1100 and
@@ -224,11 +259,13 @@ func mk_wcwidth(r: Rune): int =
     return 2
   return 1
 
+func width*(s: string): int =
+  for rune in runes(s): # adds up the width of every rune decoded from s
+    inc(result, width(rune))
 
-func mk_wcswidth(s: string): int =
-  for r in s.runes:
-    result += mk_wcwidth(r)
-
+func width*(s: seq[Rune]): int =
+  for rune in items(s): # adds up the width of every rune in s
+    inc(result, width(rune))
 
 # 
 # The following functions are the same as mk_wcwidth() and
@@ -294,7 +331,7 @@ const ambiguous = [
   ( 0xFFFD, 0xFFFD ), ( 0xF0000, 0xFFFFD ), ( 0x100000, 0x10FFFD )
 ]
 
-func mk_wcwidth_cjk*(ucs: Rune): int =
+func mk_wcwidth_cjk(ucs: Rune): int =
   # sorted list of non-overlapping intervals of East Asian Ambiguous
   # characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c"
 
@@ -302,24 +339,14 @@ func mk_wcwidth_cjk*(ucs: Rune): int =
   if bisearch(ucs, ambiguous):
     return 2;
 
-  return mk_wcwidth(ucs);
+  return width(ucs);
 
+func mk_wcswidth_cjk(s: string): int =
+  # adds up mk_wcwidth_cjk over every rune decoded from s
+  for rune in runes(s):
+    inc(result, mk_wcwidth_cjk(rune))
 
-func mk_wcswidth_cjk*(s: string): int =
-  #result = 0
-  #for r in s.runes:
-  #  result += mk_wcwidth_cjk(r)
-  #return result
-  result = 0
-  var i = 0
-  while i < len(s):
-    var r: Rune
-    fastRuneAt(s, i, r, false)
-    if uint(s[i]) <= 127: inc(i)
-    elif uint(s[i]) shr 5 == 0b110: inc(i, 2)
-    elif uint(s[i]) shr 4 == 0b1110: inc(i, 3)
-    elif uint(s[i]) shr 3 == 0b11110: inc(i, 4)
-    elif uint(s[i]) shr 2 == 0b111110: inc(i, 5)
-    elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6)
-    else: inc i
-    inc(result, mk_wcwidth_cjk(r))
+
+proc skipBlanks*(buf: string, at: var int) =
+  while at < len(buf) and isWhitespace(buf[at]): # never runs past buf's end
+    at += 1
author	bptato <nincsnevem662@gmail.com>	2021-03-12 20:52:44 +0100
committer	bptato <nincsnevem662@gmail.com>	2021-03-12 20:53:18 +0100
commit	6084e104d5c3868196c9e2d3748c6627cf983470 (patch)
tree	f5103ffd0b33049e74b44f7a91448a22e3c8ef74 /twtstr.nim
parent	424f52e1f2eff84ee23a79b38e7932c2918351ed (diff)
download	chawan-6084e104d5c3868196c9e2d3748c6627cf983470.tar.gz