Treat CJK Ideographs as letters in `isAlpha()` (#23651)

Because of the bug in `tools/parse_unicodedata.nim`, CJK Ideographs were not considered letters in `isAlpha()`, even though they have category Lo. This is because they are specified as range in `UnicodeData.txt`, not as separate characters: ``` 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; ``` The parser was not prepared to parse such ranges and thus omitted almost all CJK Ideographs from consideration. To fix this, we need to consider ranges from `UnicodeData.txt` in `tools/parse_unicodedata.nim`.
author: Alexander Kernozhitsky <sh200105@mail.ru> 2024-05-29 06:42:07 +0200
committer: GitHub <noreply@github.com> 2024-05-29 06:42:07 +0200
commit: b172b34a245959c7d5e8f4df3c3dcbe88b7ba6fa (patch)
tree: d12583026fdc68f64ba4c3af04a56b2f942637b3 /tools
parent: d923c581c118b9ea891785bbb828c3cdede587b4 (diff)
download: Nim-b172b34a245959c7d5e8f4df3c3dcbe88b7ba6fa.tar.gz
1 files changed, 45 insertions, 25 deletions
diff --git a/tools/unicode_parsedata.nim b/tools/unicode_parsedata.nim
index cca377f51..bd12998d1 100644
--- a/tools/unicode_parsedata.nim
+++ b/tools/unicode_parsedata.nim
@@ -26,34 +26,54 @@ var
 
 
 proc parseData(data: seq[string]) =
-  for line in data:
-    let
-      fields = line.split(';')
-      code = fields[0].parseHexInt()
-      category = fields[2]
-      uc = fields[12]
-      lc = fields[13]
-      tc = fields[14]
-
+  proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) =
     if category notin spaces and category notin letters:
-      continue
+      return
 
+    if firstCode != lastCode:
+      doAssert uc == "" and lc == "" and tc == ""
     if uc.len > 0:
-      let diff = 500 + uc.parseHexInt() - code
-      toUpper.add (code, diff)
+      let diff = 500 + uc.parseHexInt() - firstCode
+      toUpper.add (firstCode, diff)
     if lc.len > 0:
-      let diff = 500 + lc.parseHexInt() - code
-      toLower.add (code, diff)
+      let diff = 500 + lc.parseHexInt() - firstCode
+      toLower.add (firstCode, diff)
     if tc.len > 0 and tc != uc:
       # if titlecase is different than uppercase
-      let diff = 500 + tc.parseHexInt() - code
+      let diff = 500 + tc.parseHexInt() - firstCode
       if diff != 500:
-        toTitle.add (code, diff)
+        toTitle.add (firstCode, diff)
 
-    if category in spaces:
-      unispaces.add code
+    for code in firstCode..lastCode:
+      if category in spaces:
+        unispaces.add code
+      else:
+        alphas.add code
+
+  var idx = 0
+  while idx < data.len:
+    let
+      line = data[idx]
+      fields = line.split(';')
+      code = fields[0].parseHexInt()
+      name = fields[1]
+      category = fields[2]
+      uc = fields[12]
+      lc = fields[13]
+      tc = fields[14]
+    inc(idx)
+    if name.endsWith(", First>"):
+      doAssert idx < data.len
+      let
+        nextLine = data[idx]
+        nextFields = nextLine.split(';')
+        nextCode = nextFields[0].parseHexInt()
+        nextName = nextFields[1]
+      inc(idx)
+      doAssert nextName.endsWith(", Last>")
+      doAdd(code, nextCode, category, uc, lc, tc)
     else:
-      alphas.add code
+      doAdd(code, code, category, uc, lc, tc)
 
 proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
   ## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
@@ -153,18 +173,18 @@ proc createHeader(output: var string) =
 
 proc `$`(r: Ranges): string =
   let
-    start = "0x" & toHex(r.start, 5)
-    stop = "0x" & toHex(r.stop, 5)
+    start = "0x" & toHex(r.start, 5) & "'i32"
+    stop = "0x" & toHex(r.stop, 5) & "'i32"
   result = "$#, $#, $#,\n" % [start, stop, $r.diff]
 
 proc `$`(r: Singlets): string =
-  let code = "0x" & toHex(r.code, 5)
+  let code = "0x" & toHex(r.code, 5) & "'i32"
   result = "$#, $#,\n" % [code, $r.diff]
 
 proc `$`(r: NonLetterRanges): string =
   let
-    start = "0x" & toHex(r.start, 5)
-    stop = "0x" & toHex(r.stop, 5)
+    start = "0x" & toHex(r.start, 5) & "'i32"
+    stop = "0x" & toHex(r.stop, 5) & "'i32"
   result = "$#, $#,\n" % [start, stop]
 
 
@@ -178,7 +198,7 @@ proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
 proc outputSeq(s: seq[int], name: string, output: var string) =
   output.add "  $# = [\n" % name
   for i in s:
-    output.add "    0x$#,\n" % toHex(i, 5)
+    output.add "    0x$#'i32,\n" % toHex(i, 5)
   output.add "  ]\n\n"
 
 proc outputSpaces(s: seq[int], name: string, output: var string) =
author	Alexander Kernozhitsky <sh200105@mail.ru>	2024-05-29 06:42:07 +0200
committer	GitHub <noreply@github.com>	2024-05-29 06:42:07 +0200
commit	b172b34a245959c7d5e8f4df3c3dcbe88b7ba6fa (patch)
tree	d12583026fdc68f64ba4c3af04a56b2f942637b3 /tools
parent	d923c581c118b9ea891785bbb828c3cdede587b4 (diff)
download	Nim-b172b34a245959c7d5e8f4df3c3dcbe88b7ba6fa.tar.gz