diff options
author | Hans Raaf <hara@oderwat.de> | 2015-03-27 23:31:12 +0100 |
---|---|---|
committer | Hans Raaf <hara@oderwat.de> | 2016-06-02 17:43:10 +0200 |
commit | ac6de565ec82c5cdd3bbc3d90dc72836e985eca8 (patch) | |
tree | 5d748ab7ebdfe2e85bc2a3b822d5e8515000b2f9 | |
parent | 1138cf5234674e7942abc6bf94e88d798fb4d0e0 (diff) | |
download | Nim-ac6de565ec82c5cdd3bbc3d90dc72836e985eca8.tar.gz |
More work in optimizing, names and added substr().
This is a work in progress. I added a Unicode substring proc and also tried to handle edge cases more consistently.
-rw-r--r-- | lib/pure/unicode.nim | 46 |
1 files changed, 40 insertions, 6 deletions
```diff
diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index 7f44786e3..586111e37 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -183,25 +183,59 @@ proc `$`*(runes: seq[Rune]): string =
   result = ""
   for rune in runes: result.add(rune.toUTF8)
 
-proc runeOffset*(s: string, pos:int): int =
-  ## Returns the byte position of unicode character at position in s
+proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int =
+  ## Returns the byte position of unicode character
+  ## at position pos in s with an optional start byte position.
+  ## returns the special value -1 if it runs out of the string
   var
     i = 0
-    o = 0
+    o = start
   while i < pos:
     o += runeLenAt(s, o)
+    if o >= s.len:
+      return -1
+      #raise newException(IndexError, "Position out of bounds")
     inc i
-  o
+  return o
 
-proc rune*(s: string, pos:int): Rune =
+proc runeAtPos*(s: string, pos: int): Rune =
   ## Returns the unicode character at position pos
   fastRuneAt(s, runeOffset(s, pos), result, false)
 
-proc runeStr*(s: string, pos:int): string =
+proc runeStrAtPos*(s: string, pos: Natural): string =
   ## Returns the unicode character at position pos as UTF8 String
   let o = runeOffset(s, pos)
   s[o.. (o+runeLenAt(s, o)-1)]
 
+proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
+  ## Returns the UTF-8 substring starting at codepoint pos
+  ## with len codepoints. If pos or len is negativ they count from
+  ## the end of the string. If len is not given it means the longest
+  ## possible string. This reensembles how substr() in PHP works.
+  if pos < 0:
+    # offset from the end could be optimized further
+    var o = runeLen(s) + pos
+    if o < 0: o = 0
+    result = runeSubStr(s, o, len)
+  else:
+    let o = runeOffset(s, pos)
+    if o < 0:
+      result = ""
+    elif len == int.high:
+      result = s[o.. s.len-1]
+    elif len < 0:
+      # offset from the end could be optimized further
+      let e = runeLen(s) + len
+      if e <= 0:
+        result = ""
+      else:
+        result = s[o.. runeOffset(s, e)-1]
+    else:
+      var e = runeOffset(s, len, o)
+      if e < 0:
+        e = s.len
+      result = s[o.. e-1]
+
 const
   alphaRanges = [
     0x00d8, 0x00f6, # -
```