1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
import std/algorithm
import std/strutils
import monoucha/libunicode
import utils/charcategory
import utils/twtuni
proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} =
  ## Reallocation callback with the C calling convention. It forwards `p`
  ## and `size` to Nim's `realloc`; `opaque` is accepted but never read.
  result = p.realloc(size)
proc normalize*(rs: seq[uint32]; form = UNICODE_NFC): seq[uint32] =
  ## Returns the code points of `rs` normalized to `form` (NFC by default)
  ## by calling libunicode's `unicode_normalize`.
  ##
  ## An empty input yields an empty result. Raises `Defect` when
  ## `unicode_normalize` reports a negative length.
  {.cast(noSideEffect).}:
    if rs.len == 0:
      return @[]
    # Receives the output buffer; presumably allocated through passRealloc,
    # so it is released with Nim's dealloc once its contents are copied.
    var outbuf: ptr uint32
    let p = cast[ptr uint32](unsafeAddr rs[0])
    let outLen = unicode_normalize(addr outbuf, p, cint(rs.len), form, nil,
      passRealloc)
    if outLen < 0:
      raise newException(Defect, "Unicode normalization failed")
    if outLen == 0:
      # Nothing to copy, but a buffer may still have been handed back;
      # release it instead of leaking it. dealloc must not be given nil.
      if outbuf != nil:
        dealloc(outbuf)
      return @[]
    # Named differently from the parameter so the input is not shadowed.
    var normalized = newSeqUninitialized[uint32](outLen)
    copyMem(addr normalized[0], outbuf, outLen * sizeof(uint32))
    dealloc(outbuf)
    return normalized
proc mnormalize*(s: var string) =
  ## Normalizes `s` in place using the default form of `normalize`.
  ## A string containing no `NonAscii` character is left untouched, since
  ## ASCII needs no normalization.
  if NonAscii in s:
    let points = s.toPoints()
    s = points.normalize().toUTF8()
proc toUpperLU(s: string; n: cint): string =
  ## Runs every code point of `s` through `lre_case_conv` and returns the
  ## UTF-8 encoded result.
  ##
  ## `n` selects the conversion: 0 = upper, 1 = lower, 2 = case fold.
  var converted = newStringOfCap(s.len)
  for cp in s.points:
    # One input code point may expand to several output code points.
    var buf: array[LRE_CC_RES_LEN_MAX, uint32]
    let dst = cast[ptr UncheckedArray[uint32]](addr buf[0])
    let count = lre_case_conv(dst, cp, n)
    converted.addUTF8(buf.toOpenArray(0, count - 1))
  return converted
proc toUpperLU*(s: string): string =
  ## Returns `s` converted with case conversion mode 0 (upper case).
  toUpperLU(s, 0)
proc toLowerLU*(s: string): string =
  ## Returns `s` converted with case conversion mode 1 (lower case).
  toUpperLU(s, 1)
proc capitalizeLU*(s: string): string =
  ## Returns `s` with the first code point of every word upper-cased.
  ##
  ## A word starts at the beginning of the string and after any code point
  ## for which `lre_is_space` returns 1. All other code points, including
  ## the spaces themselves, are copied unchanged.
  var capitalized = newStringOfCap(s.len)
  var wordStart = true
  for cp in s.points:
    let isSpace = lre_is_space(cp) == 1
    if wordStart and not isSpace:
      # First non-space code point of a word: upper-case it (mode 0).
      var buf: array[LRE_CC_RES_LEN_MAX, uint32]
      let dst = cast[ptr UncheckedArray[uint32]](addr buf[0])
      let count = lre_case_conv(dst, cp, 0)
      capitalized.addUTF8(buf.toOpenArray(0, count - 1))
    else:
      capitalized.addUTF8(cp)
    # The next code point starts a word exactly when this one is a space.
    wordStart = isSpace
  return capitalized
type u32pair* {.packed.} = object
  ## Two consecutive 32-bit values, compared by `cmpRange` as the lower
  ## (`a`) and upper (`b`) bound of a range.
  a, b: uint32

func cmpRange*(x: u32pair; y: uint32): int =
  ## Three-way comparison of the range `x` against the value `y`, in the
  ## shape expected by `binarySearch`.
  ##
  ## Returns 1 when the range lies above `y` (`x.a > y`), -1 when it lies
  ## below (`x.b < y`), and 0 when `x.a <= y <= x.b`.
  if y < x.a: 1
  elif y > x.b: -1
  else: 0
func contains(cr: CharRange; u: uint32): bool =
  ## Reports whether code point `u` lies inside one of the intervals of
  ## `cr`.
  ##
  ## `cr.points` is read as `cr.len div 2` consecutive (start, end) pairs
  ## and binary-searched, so the pairs must be sorted. libunicode stores
  ## each interval half-open: `end` is the first code point *after* the
  ## interval, so `u == end` must not count as a hit.
  func cmpHalfOpen(x: u32pair; y: uint32): int =
    # Same ordering as cmpRange, except that `x.b` itself is outside the
    # interval. This also keeps the search correct when two intervals
    # touch (the end of one equals the start of the next).
    if x.a > y: 1
    elif x.b <= y: -1
    else: 0
  let cps = cast[ptr UncheckedArray[u32pair]](cr.points)
  let L = cr.len div 2 - 1 # index of the last pair; -1 when `cr` is empty
  return cps.toOpenArray(0, L).binarySearch(u, cmpHalfOpen) != -1
type
  LURangeType = enum
    ## Character ranges that can be loaded on demand. The string value of
    ## each member is the name handed to libunicode when loading it.
    lurLetter = "Letter"
    lurSeparator = "Separator"
    lurHan = "Han"
    lurHiragana = "Hiragana"
    lurKatakana = "Katakana"
    lurHangul = "Hangul"

  LUContextObj = object
    ## Storage for the lazily loaded character ranges.
    crs: array[LURangeType, CharRange] ## one range slot per range type
    inited: set[LURangeType] ## slots of `crs` that have been initialized

  LUContext* = ref LUContextObj
    ## Reference handle to a `LUContextObj`.
proc `=destroy`*(ctx: var LUContextObj) =
  ## Frees every character range recorded in `ctx.inited` with `cr_free`,
  ## then clears the set. Slots that were never initialized are skipped.
  for lur in ctx.inited:
    cr_free(addr ctx.crs[lur])
  ctx.inited = {}
proc initGeneralCategory(ctx: LUContext; lur: LURangeType) =
  ## Loads the general category named by `$lur` into `ctx.crs[lur]` the
  ## first time it is requested; later calls for the same `lur` do nothing.
  ##
  ## Asserts that `unicode_general_category` returns 0.
  if lur in ctx.inited:
    return
  let p = addr ctx.crs[lur]
  cr_init(p, nil, passRealloc)
  doAssert unicode_general_category(p, cstring($lur)) == 0
  # Only mark the slot once it is filled, so `=destroy` knows to free it.
  ctx.inited.incl(lur)
proc initScript(ctx: LUContext; lur: LURangeType) =
  ## Loads the script named by `$lur` into `ctx.crs[lur]` the first time
  ## it is requested; later calls for the same `lur` do nothing.
  ##
  ## Asserts that `unicode_script` returns 0.
  if lur in ctx.inited:
    return
  let p = addr ctx.crs[lur]
  cr_init(p, nil, passRealloc)
  doAssert unicode_script(p, cstring($lur), 0) == 0
  # Only mark the slot once it is filled, so `=destroy` knows to free it.
  ctx.inited.incl(lur)
proc isAlphaLU*(ctx: LUContext; u: uint32): bool =
  ## Reports whether `u` is in the `Letter` general category range, which
  ## is loaded into `ctx` on first use.
  ctx.initGeneralCategory(lurLetter)
  result = ctx.crs[lurLetter].contains(u)
proc isWhiteSpaceLU*(ctx: LUContext; u: uint32): bool =
  ## Reports whether `u` is in the `Separator` general category range,
  ## which is loaded into `ctx` on first use.
  ctx.initGeneralCategory(lurSeparator)
  result = ctx.crs[lurSeparator].contains(u)
proc isHan*(ctx: LUContext; u: uint32): bool =
  ## Reports whether `u` is in the `Han` script range, which is loaded
  ## into `ctx` on first use.
  ctx.initScript(lurHan)
  result = ctx.crs[lurHan].contains(u)
proc isHiragana*(ctx: LUContext; u: uint32): bool =
  ## Reports whether `u` is in the `Hiragana` script range, which is
  ## loaded into `ctx` on first use.
  ctx.initScript(lurHiragana)
  result = ctx.crs[lurHiragana].contains(u)
proc isKatakana*(ctx: LUContext; u: uint32): bool =
  ## Reports whether `u` is in the `Katakana` script range, which is
  ## loaded into `ctx` on first use.
  ctx.initScript(lurKatakana)
  result = ctx.crs[lurKatakana].contains(u)
proc isHangul*(ctx: LUContext; u: uint32): bool =
  ## Reports whether `u` is in the `Hangul` script range, which is loaded
  ## into `ctx` on first use.
  ctx.initScript(lurHangul)
  result = ctx.crs[lurHangul].contains(u)
|