diff options
Diffstat (limited to 'twtstr.nim')
-rw-r--r-- | twtstr.nim | 99 |
1 files changed, 63 insertions, 36 deletions
diff --git a/twtstr.nim b/twtstr.nim index 63854879..42a64333 100644 --- a/twtstr.nim +++ b/twtstr.nim @@ -48,8 +48,18 @@ func remove*(str: string, c: string): string = if rem != rune: result &= $rune +const ControlChars = {chr(0x00)..chr(0x1F), chr(0x7F)} + +const Whitespace = { ' ', '\n', '\r', '\t' } + +func isWhitespace*(c: char): bool = + return c in Whitespace + func isControlChar*(c: char): bool = - return c <= char(0x1F) or c == char(0x7F) + return c in ControlChars + +func isControlChar*(r: Rune): bool = + return int(r) <= int(high(char)) and char(r) in ControlChars func getControlChar*(c: char): char = if c >= 'a': @@ -84,11 +94,40 @@ func findChar*(str: string, c: Rune, start: int = 0): int = i = n return -1 +func getLowerChars*(): string = + result = "" + for i in 0..255: + if chr(i) >= 'A' and chr(i) <= 'Z': + result &= chr(i + 32) + else: + result &= chr(i) + +const lowerChars = getLowerChars() + +func tolower*(c: char): char = + return lowerChars[int(c)] + +const breakWord = [ + Rune('\n'), Rune('/'), Rune('\\'), Rune(' '), Rune('&'), Rune('='), + Rune('?'), Rune('.'), Rune(';') +] + +func breaksWord*(r: Rune): bool = + return r in breakWord + +func substr*(s: seq[Rune], i: int, j: int): seq[Rune] = + if s.len == 0: + return @[] + return s[min(high(s), i)..min(high(s), j - 1)] -#Measure length of rune. Transpiled from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c +func substr*(s: seq[Rune], i: int): seq[Rune] = + if i >= high(s) or s.len == 0: + return @[] + return s[min(high(s), i)..high(s)] + +#Measure length of rune. From https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c #auxiliary function for binary search in interval table -#TODO: use binary search from stdlib? func bisearch(ucs: Rune, table: openarray[(int, int)]): bool = var max = table.high var min = 0 @@ -107,14 +146,13 @@ func bisearch(ucs: Rune, table: openarray[(int, int)]): bool = return true return false - #The following two functions define the column width of an ISO 10646 #character as follows: # # - The null character (U+0000) has a column width of 0. # # - Other C0/C1 control characters and DEL will lead to a return -# value of -1. +# value of 2. # # - Non-spacing and enclosing combining characters (general # category code Mn or Me in the Unicode database) have a @@ -191,21 +229,18 @@ const combining = [ ( 0xE0100, 0xE01EF ) ] -func mk_wcwidth(r: Rune): int = +func width*(r: Rune): int = let ucs = int(r) # sorted list of non-overlapping intervals of non-spacing characters # generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" - # test for 8-bit control characters - if ucs == 0: - return 0 - if ucs < 32 or (ucs >= 0x7f and ucs < 0xa0): - return -1 - # binary search in table of non-spacing characters if bisearch(r, combining): return 0 + if r.isControlChar(): + return 2 + # if we arrive here, ucs is not a combining or C0/C1 control character if (ucs >= 0x1100 and @@ -224,11 +259,13 @@ func mk_wcwidth(r: Rune): int = return 2 return 1 +func width*(s: string): int = + for r in s.runes(): + result += width(r) -func mk_wcswidth(s: string): int = - for r in s.runes: - result += mk_wcwidth(r) - +func width*(s: seq[Rune]): int = + for r in s: + result += width(r) # # The following functions are the same as mk_wcwidth() and @@ -294,7 +331,7 @@ const ambiguous = [ ( 0xFFFD, 0xFFFD ), ( 0xF0000, 0xFFFFD ), ( 0x100000, 0x10FFFD ) ] -func mk_wcwidth_cjk*(ucs: Rune): int = +func mk_wcwidth_cjk(ucs: Rune): int = # sorted list of non-overlapping intervals of East Asian Ambiguous # characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" @@ -302,24 +339,14 @@ func mk_wcwidth_cjk*(ucs: Rune): int = if bisearch(ucs, ambiguous): return 2; - return mk_wcwidth(ucs); + return width(ucs); +func mk_wcswidth_cjk(s: string): int = + for r in s.runes: + result += mk_wcwidth_cjk(r) + return result -func mk_wcswidth_cjk*(s: string): int = - #result = 0 - #for r in s.runes: - # result += mk_wcwidth_cjk(r) - #return result - result = 0 - var i = 0 - while i < len(s): - var r: Rune - fastRuneAt(s, i, r, false) - if uint(s[i]) <= 127: inc(i) - elif uint(s[i]) shr 5 == 0b110: inc(i, 2) - elif uint(s[i]) shr 4 == 0b1110: inc(i, 3) - elif uint(s[i]) shr 3 == 0b11110: inc(i, 4) - elif uint(s[i]) shr 2 == 0b111110: inc(i, 5) - elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6) - else: inc i - inc(result, mk_wcwidth_cjk(r)) + +proc skipBlanks*(buf: string, at: var int) = + while at < buf.len and buf[at].isWhitespace(): + inc at |