blob: 6829ede5043c26e8c0b8a2f9648ad48bbe8e400e (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
#
#
# Nimrod's Runtime Library
# (c) Copyright 2006 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
type
  TUniChar* = int32
    ## Type that can hold any Unicode character (code point); 32 bits wide.
  TUniChar16* = int16
    ## 16-bit unit; element type of the UTF-16 sequences declared in this
    ## module.
    ## NOTE(review): `int16` is signed, so units above 0x7FFF cannot be
    ## stored as positive values -- confirm this is intended.
template ones(n: untyped): untyped =
  ## Expands to an integer whose `n` lowest bits are set and all other bits
  ## are clear, e.g. `ones(6)` is `0b111111`. Used as a bit mask.
  ((1 shl n) - 1)
proc uniCharLen*(s: string): int =
  ## Returns the number of Unicode characters of the UTF-8 string `s`.
  ##
  ## Only the lead byte of every sequence is inspected: it determines how
  ## many bytes (1 to 4) are skipped. Continuation bytes are not validated.
  ## A byte that cannot start a sequence (a stray continuation byte or a
  ## value of 0xF8 and above) is counted as one character and skipped on its
  ## own, so the scan always makes progress.
  var i = 0
  while i < len(s):
    let lead = ord(s[i])
    if lead <= 127:
      inc(i)                      # plain ASCII
    elif lead shr 5 == 0b110:
      inc(i, 2)                   # 110xxxxx: two-byte sequence
    elif lead shr 4 == 0b1110:
      inc(i, 3)                   # 1110xxxx: three-byte sequence
    elif lead shr 3 == 0b11110:
      inc(i, 4)                   # 11110xxx: four-byte sequence
    else:
      # Not a valid lead byte. Step over it instead of only asserting:
      # without this the loop would never terminate once assertions are
      # compiled out.
      inc(i)
    inc(result)
proc uniCharAt*(s: string, i: int): TUniChar =
  ## Decodes the UTF-8 sequence that starts at byte index `i` of `s` and
  ## returns the resulting Unicode character.
  ##
  ## `s[i]` has to be the lead byte of a sequence of one to four bytes. With
  ## assertions enabled, the proc checks that the sequence is not cut off by
  ## the end of the string, that every following byte has the `10xxxxxx`
  ## continuation form, and that `s[i]` is a valid lead byte. If `s[i]` is
  ## not a valid lead byte and assertions are disabled, the result is 0.
  let lead = ord(s[i])
  if lead <= 127:
    # single byte: the value is the code point
    result = TUniChar(lead)
  elif lead shr 5 == 0b110:
    assert(i + 1 < len(s), "truncated UTF-8 sequence")
    assert(ord(s[i+1]) shr 6 == 0b10)
    # 5 payload bits from the lead byte, 6 from the continuation byte
    result = TUniChar((lead and ones(5)) shl 6 or
                      (ord(s[i+1]) and ones(6)))
  elif lead shr 4 == 0b1110:
    assert(i + 2 < len(s), "truncated UTF-8 sequence")
    assert(ord(s[i+1]) shr 6 == 0b10)
    assert(ord(s[i+2]) shr 6 == 0b10)
    # 4 payload bits from the lead byte, 6 from each continuation byte
    result = TUniChar((lead and ones(4)) shl 12 or
                      (ord(s[i+1]) and ones(6)) shl 6 or
                      (ord(s[i+2]) and ones(6)))
  elif lead shr 3 == 0b11110:
    assert(i + 3 < len(s), "truncated UTF-8 sequence")
    assert(ord(s[i+1]) shr 6 == 0b10)
    assert(ord(s[i+2]) shr 6 == 0b10)
    assert(ord(s[i+3]) shr 6 == 0b10)
    # 3 payload bits from the lead byte, 6 from each continuation byte
    result = TUniChar((lead and ones(3)) shl 18 or
                      (ord(s[i+1]) and ones(6)) shl 12 or
                      (ord(s[i+2]) and ones(6)) shl 6 or
                      (ord(s[i+3]) and ones(6)))
  else:
    # `s[i]` is a continuation byte or an otherwise invalid lead byte
    assert(false)
iterator unichars*(s: string): TUniChar =
  ## Iterates over the Unicode characters of the UTF-8 string `s`, yielding
  ## each decoded character in order.
  ##
  ## Continuation bytes are not validated. A byte that cannot start a
  ## sequence, or a lead byte whose sequence is cut off by the end of the
  ## string, is yielded as its raw byte value and iteration resumes with the
  ## next byte, so the loop always makes progress and never reads past the
  ## end of `s`.
  var i = 0
  while i < len(s):
    let lead = ord(s[i])
    # Default covers ASCII as well as the malformed-input fallback.
    var ch = TUniChar(lead)
    if lead <= 127:
      inc(i)
    elif lead shr 5 == 0b110 and i + 1 < len(s):
      ch = TUniChar((lead and ones(5)) shl 6 or
                    (ord(s[i+1]) and ones(6)))
      inc(i, 2)
    elif lead shr 4 == 0b1110 and i + 2 < len(s):
      ch = TUniChar((lead and ones(4)) shl 12 or
                    (ord(s[i+1]) and ones(6)) shl 6 or
                    (ord(s[i+2]) and ones(6)))
      inc(i, 3)
    elif lead shr 3 == 0b11110 and i + 3 < len(s):
      ch = TUniChar((lead and ones(3)) shl 18 or
                    (ord(s[i+1]) and ones(6)) shl 12 or
                    (ord(s[i+2]) and ones(6)) shl 6 or
                    (ord(s[i+3]) and ones(6)))
      inc(i, 4)
    else:
      # Invalid or truncated: consume a single byte so the loop advances.
      inc(i)
    yield ch
# Forward declarations: only the signatures are given here; no implementation
# is visible in this part of the file.

# Takes a string and returns a string. Going by the name, presumably converts
# UTF-8 text to the encoding of the current locale -- TODO confirm once
# implemented.
proc utf8toLocale*(s: string): string
# Takes a string and returns a string. Presumably the inverse conversion,
# from the locale's encoding to UTF-8 -- TODO confirm once implemented.
proc localeToUtf8*(s: string): string
# Takes a string and returns a sequence of `TUniChar16`. Presumably
# re-encodes UTF-8 text as UTF-16 units -- TODO confirm once implemented.
proc utf8toUtf16*(s: string): seq[TUniChar16]
proc utf8toUcs4*(s: string): seq[TUniChar] =
  ## Decodes the UTF-8 string `s` into a sequence that holds one `TUniChar`
  ## per character yielded by `unichars(s)`, in order. An empty string gives
  ## an empty sequence.
  result = @[]
  for u in unichars(s):
    add(result, u)
# Forward declarations of module-private (not exported) procs: only the
# signatures are given here; no implementation is visible in this part of
# the file.

# Takes a sequence of `TUniChar` and returns a string. Presumably encodes the
# characters as UTF-8 -- TODO confirm once implemented.
proc ucs4ToUtf8(s: seq[TUnichar]): string
# Takes a sequence of `TUniChar16` and returns a string. Presumably converts
# UTF-16 units to UTF-8 -- TODO confirm once implemented.
proc utf16ToUtf8(s: seq[TUnichar16]): string
# Takes a sequence of `TUniChar` and returns a sequence of `TUniChar16`.
# NOTE(review): "Uft" in the name looks like a typo for "Utf" -- confirm
# before renaming.
proc ucs4toUft16(s: seq[TUnichar]): seq[TUnichar16]
# Takes a sequence of `TUniChar16` and returns a sequence of `TUniChar`.
# NOTE(review): "uft" in the name looks like a typo for "utf" -- confirm
# before renaming.
proc uft16toUcs4(s: seq[Tunichar16]): seq[TUnichar]
proc cmpUnicode*(a, b: string): int =
  ## treats `a` and `b` as UTF-8 strings and compares them. Returns:
  ## | < 0 iff a < b
  ## | > 0 iff a > b
  ## | == 0 iff a == b
  ## This routine is useful for sorting UTF-8 strings.
  ##
  ## The strings are compared byte by byte, each byte taken as an unsigned
  ## value. For well-formed UTF-8 this gives the same order as comparing the
  ## decoded code points one by one, so no decoding is needed. If one string
  ## is a prefix of the other, the shorter one sorts first.
  let shared = min(len(a), len(b))
  var i = 0
  while i < shared:
    let diff = ord(a[i]) - ord(b[i])
    if diff != 0:
      return diff
    inc(i)
  # All shared bytes are equal: the length decides.
  result = len(a) - len(b)
|