diff options
author | Alexander Kernozhitsky <sh200105@mail.ru> | 2024-05-29 06:42:07 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-29 06:42:07 +0200 |
commit | b172b34a245959c7d5e8f4df3c3dcbe88b7ba6fa (patch) | |
tree | d12583026fdc68f64ba4c3af04a56b2f942637b3 /tools | |
parent | d923c581c118b9ea891785bbb828c3cdede587b4 (diff) | |
download | Nim-b172b34a245959c7d5e8f4df3c3dcbe88b7ba6fa.tar.gz |
Treat CJK Ideographs as letters in `isAlpha()` (#23651)
Because of a bug in `tools/unicode_parsedata.nim`, CJK Ideographs were not considered letters in `isAlpha()`, even though they have category Lo. This is because they are specified as a range in `UnicodeData.txt`, not as separate characters:

```
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
```

Such a `First>`/`Last>` pair stands for every code point between the two listed values, inclusive. The parser was not prepared to parse such ranges and thus omitted almost all CJK Ideographs from consideration. To fix this, we need to consider ranges from `UnicodeData.txt` in `tools/unicode_parsedata.nim`.
Diffstat (limited to 'tools')
-rw-r--r-- | tools/unicode_parsedata.nim | 70 |
1 files changed, 45 insertions, 25 deletions
diff --git a/tools/unicode_parsedata.nim b/tools/unicode_parsedata.nim index cca377f51..bd12998d1 100644 --- a/tools/unicode_parsedata.nim +++ b/tools/unicode_parsedata.nim @@ -26,34 +26,54 @@ var proc parseData(data: seq[string]) = - for line in data: - let - fields = line.split(';') - code = fields[0].parseHexInt() - category = fields[2] - uc = fields[12] - lc = fields[13] - tc = fields[14] - + proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) = if category notin spaces and category notin letters: - continue + return + if firstCode != lastCode: + doAssert uc == "" and lc == "" and tc == "" if uc.len > 0: - let diff = 500 + uc.parseHexInt() - code - toUpper.add (code, diff) + let diff = 500 + uc.parseHexInt() - firstCode + toUpper.add (firstCode, diff) if lc.len > 0: - let diff = 500 + lc.parseHexInt() - code - toLower.add (code, diff) + let diff = 500 + lc.parseHexInt() - firstCode + toLower.add (firstCode, diff) if tc.len > 0 and tc != uc: # if titlecase is different than uppercase - let diff = 500 + tc.parseHexInt() - code + let diff = 500 + tc.parseHexInt() - firstCode if diff != 500: - toTitle.add (code, diff) + toTitle.add (firstCode, diff) - if category in spaces: - unispaces.add code + for code in firstCode..lastCode: + if category in spaces: + unispaces.add code + else: + alphas.add code + + var idx = 0 + while idx < data.len: + let + line = data[idx] + fields = line.split(';') + code = fields[0].parseHexInt() + name = fields[1] + category = fields[2] + uc = fields[12] + lc = fields[13] + tc = fields[14] + inc(idx) + if name.endsWith(", First>"): + doAssert idx < data.len + let + nextLine = data[idx] + nextFields = nextLine.split(';') + nextCode = nextFields[0].parseHexInt() + nextName = nextFields[1] + inc(idx) + doAssert nextName.endsWith(", Last>") + doAdd(code, nextCode, category, uc, lc, tc) else: - alphas.add code + doAdd(code, code, category, uc, lc, tc) proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) = 
## Splits `toLower`, `toUpper` and `toTitle` into separate sequences: @@ -153,18 +173,18 @@ proc createHeader(output: var string) = proc `$`(r: Ranges): string = let - start = "0x" & toHex(r.start, 5) - stop = "0x" & toHex(r.stop, 5) + start = "0x" & toHex(r.start, 5) & "'i32" + stop = "0x" & toHex(r.stop, 5) & "'i32" result = "$#, $#, $#,\n" % [start, stop, $r.diff] proc `$`(r: Singlets): string = - let code = "0x" & toHex(r.code, 5) + let code = "0x" & toHex(r.code, 5) & "'i32" result = "$#, $#,\n" % [code, $r.diff] proc `$`(r: NonLetterRanges): string = let - start = "0x" & toHex(r.start, 5) - stop = "0x" & toHex(r.stop, 5) + start = "0x" & toHex(r.start, 5) & "'i32" + stop = "0x" & toHex(r.stop, 5) & "'i32" result = "$#, $#,\n" % [start, stop] @@ -178,7 +198,7 @@ proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string, proc outputSeq(s: seq[int], name: string, output: var string) = output.add " $# = [\n" % name for i in s: - output.add " 0x$#,\n" % toHex(i, 5) + output.add " 0x$#'i32,\n" % toHex(i, 5) output.add " ]\n\n" proc outputSpaces(s: seq[int], name: string, output: var string) = |