about summary refs log tree commit diff stats
path: root/src/utils/luwrap.nim
blob: 7ccb2c518ce233ae9d6a04f6c6f4586b15c75834 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import std/algorithm
import std/strutils
import std/unicode

import monoucha/libunicode
import utils/charcategory

proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} =
  ## C-callable allocation callback handed to the libunicode routines used in
  ## this module. It simply forwards to `realloc`; the `opaque` user pointer
  ## is ignored.
  realloc(p, size)

proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
  ## Runs `rs` through libunicode's `unicode_normalize` using the requested
  ## normalization `form` (NFC by default) and returns the resulting runes.
  ##
  ## An empty input yields an empty sequence without calling into libunicode.
  ## Raises `Defect` when `unicode_normalize` reports a negative length.
  {.cast(noSideEffect).}:
    if rs.len == 0:
      return @[]
    # The runes are handed to C as a plain buffer of 32-bit values.
    let src = cast[ptr uint32](unsafeAddr rs[0])
    var normalized: ptr uint32
    let count = unicode_normalize(addr normalized, src, cint(rs.len), form,
      nil, passRealloc)
    if count < 0:
      raise newException(Defect, "Unicode normalization failed")
    if count > 0:
      result = cast[seq[Rune]](newSeqUninitialized[uint32](count))
      copyMem(addr result[0], normalized, count * sizeof(uint32))
      # The output buffer was obtained through passRealloc, so it is released
      # with the matching Nim deallocator once its contents are copied.
      dealloc(normalized)

proc mnormalize*(s: var string) =
  ## Normalizes `s` in place with the default form of `normalize`.
  ## Strings without any `NonAscii` byte are left untouched, since there is
  ## no need to normalize ASCII.
  if NonAscii in s:
    s = $normalize(s.toRunes())

# n == 0: upper, 1: lower, 2: case fold
proc toUpperLU(s: string; n: cint): string =
  ## Converts every rune of `s` with libunicode's `lre_case_conv`, using the
  ## conversion mode `n` described above, and returns the concatenated
  ## output. A single input rune may expand to several output code points.
  result = newStringOfCap(s.len)
  var converted: array[LRE_CC_RES_LEN_MAX, uint32]
  for r in s.runes:
    let count = lre_case_conv(
      cast[ptr UncheckedArray[uint32]](addr converted[0]), uint32(r), n)
    for i in 0 ..< count:
      result.add(Rune(converted[i]))

proc toUpperLU*(s: string): string =
  ## Returns `s` converted with case conversion mode 0 (upper case).
  toUpperLU(s, 0)

proc toLowerLU*(s: string): string =
  ## Returns `s` converted with case conversion mode 1 (lower case).
  toUpperLU(s, 1)

proc capitalizeLU*(s: string): string =
  ## Returns a copy of `s` in which the first rune of the string and every
  ## rune that directly follows whitespace is passed through `lre_case_conv`
  ## in mode 0 (upper case). All other runes are copied unchanged.
  ##
  ## Whitespace means a code point below 256 that is in `AsciiWhitespace`,
  ## or any code point accepted by `lre_is_space_non_ascii`.
  result = newStringOfCap(s.len)
  var atWordStart = true
  for r in s.runes:
    let cp = uint32(r)
    # `or` short-circuits, so libunicode is only consulted when the ASCII
    # check did not already match.
    let isSpace = (cp < 256 and char(r) in AsciiWhitespace) or
      lre_is_space_non_ascii(cp) == 1
    if isSpace:
      atWordStart = true
      result.add(r)
    elif atWordStart:
      var converted: array[LRE_CC_RES_LEN_MAX, uint32]
      let count = lre_case_conv(
        cast[ptr UncheckedArray[uint32]](addr converted[0]), cp, 0)
      for i in 0 ..< count:
        result.add(Rune(converted[i]))
      atWordStart = false
    else:
      result.add(r)

type u32pair* {.packed.} = object
  ## Two consecutive 32-bit values. Packed so that a flat buffer of `uint32`
  ## points can be reinterpreted as an array of (a, b) pairs.
  a, b: uint32

func cmpRange*(x: u32pair; y: uint32): int =
  ## Three-way comparison of the range `x` against the value `y`, suitable
  ## as a comparator for `binarySearch`:
  ## * `1` when the range starts after `y` (`y < x.a`),
  ## * `-1` when the range ends before `y` (`y > x.b`),
  ## * `0` when `x.a <= y <= x.b`, i.e. both bounds are treated as inclusive.
  if y < x.a: 1
  elif y > x.b: -1
  else: 0

func contains(cr: CharRange; r: Rune): bool =
  ## Reports whether `r` lies inside one of the intervals stored in `cr`.
  ##
  ## libunicode keeps a `CharRange` as a sorted, even-length list of points
  ## in which each consecutive pair delimits the half-open interval
  ## `[start, end)`. The end point itself is therefore NOT a member, so the
  ## search uses a comparator with an exclusive upper bound rather than the
  ## inclusive `cmpRange`, which would wrongly accept the first code point
  ## after every interval.
  func cmpHalfOpen(x: u32pair; y: uint32): int =
    # Orders the interval [x.a, x.b) relative to the code point y.
    if x.a > y: 1
    elif x.b <= y: -1
    else: 0
  # View the flat point buffer as (start, end) pairs.
  let cps = cast[ptr UncheckedArray[u32pair]](cr.points)
  # Index of the last pair; -1 (an empty slice) when there are no points.
  let last = cr.len div 2 - 1
  cps.toOpenArray(0, last).binarySearch(uint32(r), cmpHalfOpen) != -1

type
  # Identifies one lazily loaded character range. The string value of each
  # member is the name passed to libunicode when the range is loaded
  # (`unicode_general_category` or `unicode_script`).
  LURangeType = enum
    lurLetter = "Letter"
    lurSeparator = "Separator"
    lurHan = "Han"
    lurHiragana = "Hiragana"
    lurKatakana = "Katakana"
    lurHangul = "Hangul"

  # Cache of character ranges, indexed by range type.
  LUContextObj = object
    # One range slot per type; a slot is only valid once its type is
    # recorded in `inited`.
    crs: array[LURangeType, CharRange]
    # Range types whose slot in `crs` has been initialized (and therefore
    # must be freed by the destructor).
    inited: set[LURangeType]

  # Reference-counted handle used by the public lookup procs.
  LUContext* = ref LUContextObj

# Deprecation warnings are silenced for this hook; presumably because newer
# compilers flag the `var` parameter form of `=destroy` - TODO confirm.
{.warning[Deprecated]: off.}:
  proc `=destroy`*(ctx: var LUContextObj) =
    ## Frees every character range that was initialized in `ctx` and marks
    ## the context as holding none, so a repeated call frees nothing twice.
    for lur in ctx.inited:
      cr_free(addr ctx.crs[lur])
    ctx.inited = {}

proc initGeneralCategory(ctx: LUContext; lur: LURangeType) =
  ## Loads the range for `lur` into `ctx` on first use, treating the string
  ## value of `lur` as a Unicode general category name. Does nothing when
  ## the range is already initialized. Fails an assertion if libunicode
  ## reports an error.
  if lur in ctx.inited:
    return
  let p = addr ctx.crs[lur]
  cr_init(p, nil, passRealloc)
  doAssert unicode_general_category(p, cstring($lur)) == 0
  # Only mark the slot once it has been filled successfully.
  ctx.inited.incl(lur)

proc initScript(ctx: LUContext; lur: LURangeType) =
  ## Loads the range for `lur` into `ctx` on first use, treating the string
  ## value of `lur` as a Unicode script name. Does nothing when the range is
  ## already initialized. Fails an assertion if libunicode reports an error.
  if lur in ctx.inited:
    return
  let p = addr ctx.crs[lur]
  cr_init(p, nil, passRealloc)
  doAssert unicode_script(p, cstring($lur), 0) == 0
  # Only mark the slot once it has been filled successfully.
  ctx.inited.incl(lur)

proc isAlphaLU*(ctx: LUContext; r: Rune): bool =
  ## True if `r` is in the "Letter" general category range, which is loaded
  ## into `ctx` on first use.
  ctx.initGeneralCategory(lurLetter)
  contains(ctx.crs[lurLetter], r)

proc isWhiteSpaceLU*(ctx: LUContext; r: Rune): bool =
  ## True if `r` is in the "Separator" general category range, which is
  ## loaded into `ctx` on first use.
  # NOTE(review): control characters such as tab or line feed are presumably
  # not part of the Separator category - confirm callers do not expect them.
  ctx.initGeneralCategory(lurSeparator)
  contains(ctx.crs[lurSeparator], r)

proc isHan*(ctx: LUContext; r: Rune): bool =
  ## True if `r` is in the "Han" script range, which is loaded into `ctx`
  ## on first use.
  ctx.initScript(lurHan)
  contains(ctx.crs[lurHan], r)

proc isHiragana*(ctx: LUContext; r: Rune): bool =
  ## True if `r` is in the "Hiragana" script range, which is loaded into
  ## `ctx` on first use.
  ctx.initScript(lurHiragana)
  contains(ctx.crs[lurHiragana], r)

proc isKatakana*(ctx: LUContext; r: Rune): bool =
  ## True if `r` is in the "Katakana" script range, which is loaded into
  ## `ctx` on first use.
  ctx.initScript(lurKatakana)
  contains(ctx.crs[lurKatakana], r)

proc isHangul*(ctx: LUContext; r: Rune): bool =
  ## True if `r` is in the "Hangul" script range, which is loaded into `ctx`
  ## on first use.
  ctx.initScript(lurHangul)
  contains(ctx.crs[lurHangul], r)