diff options
Diffstat (limited to 'tools/unicode_parsedata.nim')
-rw-r--r-- | tools/unicode_parsedata.nim | 224 |
1 files changed, 224 insertions, 0 deletions
diff --git a/tools/unicode_parsedata.nim b/tools/unicode_parsedata.nim new file mode 100644 index 000000000..bd12998d1 --- /dev/null +++ b/tools/unicode_parsedata.nim @@ -0,0 +1,224 @@ +import strutils, algorithm + +let + # this file was obtained from: + # https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + filename = "tools/UnicodeData.txt" + data = readFile(filename).strip.splitLines() + +const + # see the table here: + # https://www.unicode.org/reports/tr44/#GC_Values_Table + letters = ["Lu", "Ll", "Lt", "Lm", "Lo"] + spaces = ["Zs", "Zl", "Zp"] + +type + Ranges = tuple[start, stop, diff: int] + Singlets = tuple[code, diff: int] + NonLetterRanges = tuple[start, stop: int] + +var + toUpper = newSeq[Singlets]() + toLower = newSeq[Singlets]() + toTitle = newSeq[Singlets]() + alphas = newSeq[int]() + unispaces = newSeq[int]() + + +proc parseData(data: seq[string]) = + proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) = + if category notin spaces and category notin letters: + return + + if firstCode != lastCode: + doAssert uc == "" and lc == "" and tc == "" + if uc.len > 0: + let diff = 500 + uc.parseHexInt() - firstCode + toUpper.add (firstCode, diff) + if lc.len > 0: + let diff = 500 + lc.parseHexInt() - firstCode + toLower.add (firstCode, diff) + if tc.len > 0 and tc != uc: + # if titlecase is different than uppercase + let diff = 500 + tc.parseHexInt() - firstCode + if diff != 500: + toTitle.add (firstCode, diff) + + for code in firstCode..lastCode: + if category in spaces: + unispaces.add code + else: + alphas.add code + + var idx = 0 + while idx < data.len: + let + line = data[idx] + fields = line.split(';') + code = fields[0].parseHexInt() + name = fields[1] + category = fields[2] + uc = fields[12] + lc = fields[13] + tc = fields[14] + inc(idx) + if name.endsWith(", First>"): + doAssert idx < data.len + let + nextLine = data[idx] + nextFields = nextLine.split(';') + nextCode = nextFields[0].parseHexInt() + nextName = nextFields[1] + inc(idx) + doAssert nextName.endsWith(", Last>") + doAdd(code, nextCode, category, uc, lc, tc) + else: + doAdd(code, code, category, uc, lc, tc) + +proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) = + ## Splits `toLower`, `toUpper` and `toTitle` into separate sequences: + ## - `r` contains continuous ranges with the same characteristics + ## (their upper/lower version is the same distance away) + ## - `s` contains single code points + var i, j: int + while i < a.len: + j = 1 + let + startCode = a[i].code + startDiff = a[i].diff + while i + j <= a.len: + if i+j >= a.len or a[i+j].code != startCode+j or a[i+j].diff != startDiff: + if j == 1: + s.add (startCode, startDiff) + else: + r.add (startCode, a[i+j-1].code, startDiff) + i += j-1 + break + else: + inc j + inc i + +proc splitRanges(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) = + ## Splits `alphas` and `unispaces` into separate sequences: + ## - `r` contains continuous ranges + ## - `s` contains single code points + var i, j: int + while i < a.len: + j = 1 + let startCode = a[i] + while i + j <= a.len: + if i+j >= a.len or a[i+j] != startCode+j: + if j == 1: + s.add startCode + else: + r.add (startCode, a[i+j-1]) + i += j-1 + break + else: + inc j + inc i + +proc splitSpaces(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) = + ## Spaces are special because of the way how `isWhiteSpace` and `split` + ## are implemented. + ## + ## All spaces are added both to `r` (ranges) and `s` (singlets). + var i, j: int + while i < a.len: + j = 1 + let startCode = a[i] + while i + j <= a.len: + if i+j >= a.len or a[i+j] != startCode+j: + r.add (startCode, a[i+j-1]) + i += j-1 + break + else: + inc j + inc i + s = a + + +var + toupperRanges = newSeq[Ranges]() + toupperSinglets = newSeq[Singlets]() + tolowerRanges = newSeq[Ranges]() + tolowerSinglets = newSeq[Singlets]() + totitleRanges = newSeq[Ranges]() + totitleSinglets = newSeq[Singlets]() + spaceRanges = newSeq[NonLetterRanges]() + unicodeSpaces = newSeq[int]() + alphaRanges = newSeq[NonLetterRanges]() + alphaSinglets = newSeq[int]() + +parseData(data) +splitRanges(toLower, tolowerRanges, tolowerSinglets) +splitRanges(toUpper, toUpperRanges, toUpperSinglets) +splitRanges(toTitle, toTitleRanges, toTitleSinglets) +splitRanges(alphas, alphaRanges, alphaSinglets) + +# manually add "special" spaces +for i in 9 .. 13: + unispaces.add i +unispaces.add 0x85 +unispaces.sort() + +splitSpaces(unispaces, spaceRanges, unicodeSpaces) + + +var output: string + +proc createHeader(output: var string) = + output.add "# This file was created from a script.\n\n" + output.add "const\n" + +proc `$`(r: Ranges): string = + let + start = "0x" & toHex(r.start, 5) & "'i32" + stop = "0x" & toHex(r.stop, 5) & "'i32" + result = "$#, $#, $#,\n" % [start, stop, $r.diff] + +proc `$`(r: Singlets): string = + let code = "0x" & toHex(r.code, 5) & "'i32" + result = "$#, $#,\n" % [code, $r.diff] + +proc `$`(r: NonLetterRanges): string = + let + start = "0x" & toHex(r.start, 5) & "'i32" + stop = "0x" & toHex(r.stop, 5) & "'i32" + result = "$#, $#,\n" % [start, stop] + + +proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string, + output: var string) = + output.add " $# = [\n" % name + for r in s: + output.add " " & $r + output.add " ]\n\n" + +proc outputSeq(s: seq[int], name: string, output: var string) = + output.add " $# = [\n" % name + for i in s: + output.add " 0x$#'i32,\n" % toHex(i, 5) + output.add " ]\n\n" + +proc outputSpaces(s: seq[int], name: string, output: var string) = + output.add " $# = [\n" % name + for i in s: + output.add " Rune 0x$#,\n" % toHex(i, 5) + output.add " ]\n\n" + + +output.createHeader() +outputSeq(tolowerRanges, "toLowerRanges", output) +outputSeq(tolowerSinglets, "toLowerSinglets", output) +outputSeq(toupperRanges, "toUpperRanges", output) +outputSeq(toupperSinglets, "toUpperSinglets", output) +outputSeq(totitleSinglets, "toTitleSinglets", output) +outputSeq(alphaRanges, "alphaRanges", output) +outputSeq(alphaSinglets, "alphaSinglets", output) +outputSeq(spaceRanges, "spaceRanges", output) +outputSpaces(unispaces, "unicodeSpaces", output) # array of runes + + +let outfile = "lib/pure/includes/unicode_ranges.nim" +outfile.writeFile(output) |