import std/algorithm import std/strutils import monoucha/libunicode import monoucha/libregexp import utils/charcategory import utils/twtuni proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} = return realloc(p, size) proc normalize*(rs: seq[uint32]; form = UNICODE_NFC): seq[uint32] = {.cast(noSideEffect).}: if rs.len == 0: return @[] var outbuf: ptr uint32 let p = cast[ptr uint32](unsafeAddr rs[0]) let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, nil, passRealloc) if out_len < 0: raise newException(Defect, "Unicode normalization failed") if out_len == 0: return var rs = newSeqUninitialized[uint32](out_len) copyMem(addr rs[0], outbuf, out_len * sizeof(uint32)) dealloc(outbuf) return rs proc mnormalize*(s: var string) = if NonAscii notin s: return # no need to normalize ascii s = s.toPoints().normalize().toUTF8() # n == 0: upper, 1: lower, 2: case fold proc toUpperLU(s: string; n: cint): string = result = newStringOfCap(s.len) for u in s.points: var outa: array[LRE_CC_RES_LEN_MAX, uint32] let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), u, n) result.addUTF8(outa.toOpenArray(0, n - 1)) proc toUpperLU*(s: string): string = return s.toUpperLU(0) proc toLowerLU*(s: string): string = return s.toUpperLU(1) proc capitalizeLU*(s: string): string = result = newStringOfCap(s.len) var wordStart = true for u in s.points: if lre_is_space(u) == 1: wordStart = true result.addUTF8(u) elif wordStart: var outa: array[LRE_CC_RES_LEN_MAX, uint32] let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), u, 0) result.addUTF8(outa.toOpenArray(0, n - 1)) wordStart = false else: result.addUTF8(u) type u32pair* {.packed.} = object a: uint32 b: uint32 func cmpRange*(x: u32pair; y: uint32): int = if x.a > y: return 1 elif x.b < y: return -1 return 0 func contains(cr: CharRange; u: uint32): bool = let cps = cast[ptr UncheckedArray[u32pair]](cr.points) let L = cr.len div 2 - 1 return cps.toOpenArray(0, L).binarySearch(u, cmpRange) != -1 type LURangeType = enum lurLetter = "Letter" lurSeparator = "Separator" lurHan = "Han" lurHiragana = "Hiragana" lurKatakana = "Katakana" lurHangul = "Hangul" LUContextObj = object crs: array[LURangeType, CharRange] inited: set[LURangeType] LUContext* = ref LUContextObj proc `=destroy`*(ctx: var LUContextObj) = for lur, cr in ctx.crs.mpairs: if lur in ctx.inited: cr_free(addr cr) ctx.inited = {} proc initGeneralCategory(ctx: LUContext; lur: LURangeType) = if lur notin ctx.inited: let p = addr ctx.crs[lur] cr_init(p, nil, passRealloc) doAssert unicode_general_category(p, cstring($lur)) == 0 ctx.inited.incl(lur) proc initScript(ctx: LUContext; lur: LURangeType) = if lur notin ctx.inited: let p = addr ctx.crs[lur] cr_init(p, nil, passRealloc) doAssert unicode_script(p, cstring($lur), 0) == 0 ctx.inited.incl(lur) proc isAlphaLU*(ctx: LUContext; u: uint32): bool = ctx.initGeneralCategory(lurLetter) return u in ctx.crs[lurLetter] proc isWhiteSpaceLU*(ctx: LUContext; u: uint32): bool = ctx.initGeneralCategory(lurSeparator) return u in ctx.crs[lurSeparator] proc isHan*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHan) return u in ctx.crs[lurHan] proc isHiragana*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHiragana) return u in ctx.crs[lurHiragana] proc isKatakana*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurKatakana) return u in ctx.crs[lurKatakana] proc isHangul*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHangul) return u in ctx.crs[lurHangul]