summary refs log tree commit diff stats
path: root/tools/unicode_parsedata.nim
diff options
context:
space:
mode:
Diffstat (limited to 'tools/unicode_parsedata.nim')
-rw-r--r--tools/unicode_parsedata.nim224
1 files changed, 224 insertions, 0 deletions
diff --git a/tools/unicode_parsedata.nim b/tools/unicode_parsedata.nim
new file mode 100644
index 000000000..bd12998d1
--- /dev/null
+++ b/tools/unicode_parsedata.nim
@@ -0,0 +1,224 @@
+import strutils, algorithm
+
+let
+  # this file was obtained from:
+  # https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
+  filename = "tools/UnicodeData.txt"
+  data = readFile(filename).strip.splitLines()
+
+const
+  # see the table here:
+  # https://www.unicode.org/reports/tr44/#GC_Values_Table
+  letters = ["Lu", "Ll", "Lt", "Lm", "Lo"]
+  spaces = ["Zs", "Zl", "Zp"]
+
+type
+  Ranges = tuple[start, stop, diff: int]
+  Singlets = tuple[code, diff: int]
+  NonLetterRanges = tuple[start, stop: int]
+
+var
+  toUpper = newSeq[Singlets]()
+  toLower = newSeq[Singlets]()
+  toTitle = newSeq[Singlets]()
+  alphas = newSeq[int]()
+  unispaces = newSeq[int]()
+
+
+proc parseData(data: seq[string]) =
+  proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) =
+    if category notin spaces and category notin letters:
+      return
+
+    if firstCode != lastCode:
+      doAssert uc == "" and lc == "" and tc == ""
+    if uc.len > 0:
+      let diff = 500 + uc.parseHexInt() - firstCode
+      toUpper.add (firstCode, diff)
+    if lc.len > 0:
+      let diff = 500 + lc.parseHexInt() - firstCode
+      toLower.add (firstCode, diff)
+    if tc.len > 0 and tc != uc:
+      # if titlecase is different than uppercase
+      let diff = 500 + tc.parseHexInt() - firstCode
+      if diff != 500:
+        toTitle.add (firstCode, diff)
+
+    for code in firstCode..lastCode:
+      if category in spaces:
+        unispaces.add code
+      else:
+        alphas.add code
+
+  var idx = 0
+  while idx < data.len:
+    let
+      line = data[idx]
+      fields = line.split(';')
+      code = fields[0].parseHexInt()
+      name = fields[1]
+      category = fields[2]
+      uc = fields[12]
+      lc = fields[13]
+      tc = fields[14]
+    inc(idx)
+    if name.endsWith(", First>"):
+      doAssert idx < data.len
+      let
+        nextLine = data[idx]
+        nextFields = nextLine.split(';')
+        nextCode = nextFields[0].parseHexInt()
+        nextName = nextFields[1]
+      inc(idx)
+      doAssert nextName.endsWith(", Last>")
+      doAdd(code, nextCode, category, uc, lc, tc)
+    else:
+      doAdd(code, code, category, uc, lc, tc)
+
+proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
+  ## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
+  ## - `r` contains continuous ranges with the same characteristics
+  ##   (their upper/lower version is the same distance away)
+  ## - `s` contains single code points
+  var i, j: int
+  while i < a.len:
+    j = 1
+    let
+      startCode = a[i].code
+      startDiff = a[i].diff
+    while i + j <= a.len:
+      if i+j >= a.len or a[i+j].code != startCode+j or a[i+j].diff != startDiff:
+        if j == 1:
+          s.add (startCode, startDiff)
+        else:
+          r.add (startCode, a[i+j-1].code, startDiff)
+        i += j-1
+        break
+      else:
+        inc j
+    inc i
+
+proc splitRanges(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
+  ## Splits `alphas` and `unispaces` into separate sequences:
+  ## - `r` contains continuous ranges
+  ## - `s` contains single code points
+  var i, j: int
+  while i < a.len:
+    j = 1
+    let startCode = a[i]
+    while i + j <= a.len:
+      if i+j >= a.len or a[i+j] != startCode+j:
+        if j == 1:
+          s.add startCode
+        else:
+          r.add (startCode, a[i+j-1])
+        i += j-1
+        break
+      else:
+        inc j
+    inc i
+
+proc splitSpaces(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
+  ## Spaces are special because of the way how `isWhiteSpace` and `split`
+  ## are implemented.
+  ##
+  ## All spaces are added both to `r` (ranges) and `s` (singlets).
+  var i, j: int
+  while i < a.len:
+    j = 1
+    let startCode = a[i]
+    while i + j <= a.len:
+      if i+j >= a.len or a[i+j] != startCode+j:
+        r.add (startCode, a[i+j-1])
+        i += j-1
+        break
+      else:
+        inc j
+    inc i
+  s = a
+
+
+var
+  toupperRanges = newSeq[Ranges]()
+  toupperSinglets = newSeq[Singlets]()
+  tolowerRanges = newSeq[Ranges]()
+  tolowerSinglets = newSeq[Singlets]()
+  totitleRanges = newSeq[Ranges]()
+  totitleSinglets = newSeq[Singlets]()
+  spaceRanges = newSeq[NonLetterRanges]()
+  unicodeSpaces = newSeq[int]()
+  alphaRanges = newSeq[NonLetterRanges]()
+  alphaSinglets = newSeq[int]()
+
+parseData(data)
+splitRanges(toLower, tolowerRanges, tolowerSinglets)
+splitRanges(toUpper, toUpperRanges, toUpperSinglets)
+splitRanges(toTitle, toTitleRanges, toTitleSinglets)
+splitRanges(alphas, alphaRanges, alphaSinglets)
+
+# manually add "special" spaces
+for i in 9 .. 13:
+  unispaces.add i
+unispaces.add 0x85
+unispaces.sort()
+
+splitSpaces(unispaces, spaceRanges, unicodeSpaces)
+
+
+var output: string
+
+proc createHeader(output: var string) =
+  output.add "# This file was created from a script.\n\n"
+  output.add "const\n"
+
+proc `$`(r: Ranges): string =
+  let
+    start = "0x" & toHex(r.start, 5) & "'i32"
+    stop = "0x" & toHex(r.stop, 5) & "'i32"
+  result = "$#, $#, $#,\n" % [start, stop, $r.diff]
+
+proc `$`(r: Singlets): string =
+  let code = "0x" & toHex(r.code, 5) & "'i32"
+  result = "$#, $#,\n" % [code, $r.diff]
+
+proc `$`(r: NonLetterRanges): string =
+  let
+    start = "0x" & toHex(r.start, 5) & "'i32"
+    stop = "0x" & toHex(r.stop, 5) & "'i32"
+  result = "$#, $#,\n" % [start, stop]
+
+
+proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
+               output: var string) =
+  output.add "  $# = [\n" % name
+  for r in s:
+    output.add "    " & $r
+  output.add "  ]\n\n"
+
+proc outputSeq(s: seq[int], name: string, output: var string) =
+  output.add "  $# = [\n" % name
+  for i in s:
+    output.add "    0x$#'i32,\n" % toHex(i, 5)
+  output.add "  ]\n\n"
+
+proc outputSpaces(s: seq[int], name: string, output: var string) =
+  output.add "  $# = [\n" % name
+  for i in s:
+    output.add "    Rune 0x$#,\n" % toHex(i, 5)
+  output.add "  ]\n\n"
+
+
+output.createHeader()
+outputSeq(tolowerRanges,   "toLowerRanges",   output)
+outputSeq(tolowerSinglets, "toLowerSinglets", output)
+outputSeq(toupperRanges,   "toUpperRanges",   output)
+outputSeq(toupperSinglets, "toUpperSinglets", output)
+outputSeq(totitleSinglets, "toTitleSinglets", output)
+outputSeq(alphaRanges,     "alphaRanges",     output)
+outputSeq(alphaSinglets,   "alphaSinglets",   output)
+outputSeq(spaceRanges,     "spaceRanges",     output)
+outputSpaces(unispaces,    "unicodeSpaces",   output) # array of runes
+
+
+let outfile = "lib/pure/includes/unicode_ranges.nim"
+outfile.writeFile(output)