From c970190021a0aeb2b73558a9095f63a774f439ba Mon Sep 17 00:00:00 2001
From: "Kartik K. Agaram"
Date: Mon, 30 Aug 2021 00:32:15 -0700
Subject: first rendering of non-latin script

Open question: why does column 0 get cropped? The spacing also seems
excessive. Are we taking up 3 grid points?
---
 403unicode.mu | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 apps/ex14.mu  |  27 ++++++++++++++++
 2 files changed, 127 insertions(+), 2 deletions(-)
 create mode 100644 apps/ex14.mu

diff --git a/403unicode.mu b/403unicode.mu
index 69bd0959..134f8216 100644
--- a/403unicode.mu
+++ b/403unicode.mu
@@ -14,9 +14,88 @@
 # multiple code points. One day we will.
 # https://en.wikipedia.org/wiki/Combining_character
 
+fn test-unicode-serialization-and-deserialization {
+  var i/ebx: int <- copy 0
+  var init?/esi: boolean <- copy 1/true  # true until the first failure, so the "F - ..." header prints only once
+  {
+    compare i, 0x10000  # 32 bits of utf-8 are sufficient for https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
+                        # but not emoji
+    break-if->=
+    var c/eax: code-point <- copy i
+    var _g/eax: grapheme <- to-grapheme c
+    var g/ecx: grapheme <- copy _g
+    var c2/eax: code-point <- to-code-point g
+    compare i, c2  # round trip code-point -> grapheme -> code-point must be the identity
+    {
+      break-if-=
+      {
+        compare init?, 0/false
+        break-if-=
+        draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
+      }
+      init? <- copy 0/false
+      draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, i, 3/fg 0/bg
+      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
+      {
+        var x/eax: int <- copy g
+        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x, 3/fg 0/bg
+      }
+      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
+      {
+        var x2/eax: int <- copy c2
+        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x2, 3/fg 0/bg
+      }
+      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
+    }
+    i <- add 0xf  # to speed things up; ensure increment is not a power of 2
+    loop
+  }
+}
+
+# transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox
 fn to-code-point in: grapheme -> _/eax: code-point {
-  var g/eax: grapheme <- copy in
-  var result/eax: code-point <- copy g  # TODO: support non-ASCII
+  var g/ebx: int <- copy in
+  # if single byte, just return it
+  {
+    compare g, 0xff  # checked unsigned below: a 4-byte grapheme has its top bit set and is negative as a signed int
+    break-if-addr>
+    var result/eax: code-point <- copy g
+    return result
+  }
+  # multi-byte: the lead byte is the lowest byte of g; later bytes sit in successively higher bytes
+  var len/edx: int <- grapheme-length in
+  # extract bits from first byte
+  var b/eax: byte <- copy-byte g
+  var result/edi: code-point <- copy b
+  {
+    compare len, 2
+    break-if-!=
+    result <- and 0x1f  # lead byte 110xxxxx: keep the low 5 bits
+  }
+  {
+    compare len, 3
+    break-if-!=
+    result <- and 0x0f  # lead byte 1110xxxx: keep the low 4 bits
+  }
+  {
+    compare len, 4
+    break-if-!=
+    result <- and 0x07  # lead byte 11110xxx: keep the low 3 bits
+  }
+  # extract bits from remaining bytes
+  g <- shift-right 8
+  var i/ecx: int <- copy 1
+  {
+    compare i, len
+    break-if->=
+    var b/eax: byte <- copy-byte g
+    b <- and 0x3f  # continuation byte 10xxxxxx: keep the low 6 bits
+    result <- shift-left 6
+    result <- or b
+    g <- shift-right 8
+    i <- increment
+    loop
+  }
   return result
 }
 
@@ -220,6 +299,25 @@ fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
   return result
 }
 
+fn grapheme-length g: grapheme -> _/edx: int {
+  {
+    compare g, 0xff  # all range checks here are unsigned: a 4-byte grapheme is negative as a signed int
+    break-if-addr>
+    return 1
+  }
+  {
+    compare g, 0xffff  # fits in the low 2 bytes
+    break-if-addr>
+    return 2
+  }
+  {
+    compare g, 0xffffff  # fits in the low 3 bytes
+    break-if-addr>
+    return 3
+  }
+  return 4
+}
+
 # needed because available primitives only shift by a literal/constant number of bits
 fn shift-left-bytes n: int, k: int -> _/eax: int {
   var i/ecx: int <- copy 0
diff --git a/apps/ex14.mu b/apps/ex14.mu
new file mode 100644
index 00000000..600aded5
--- /dev/null
+++ b/apps/ex14.mu
@@ -0,0 +1,27 @@
+# Unicode demo
+#
+# Mu can't read Unicode from keyboard yet, so we'll read from disk and print
+# to screen.
+#
+# Steps for trying it out:
+#   1. Translate this example into a disk image code.img.
+#       ./translate apps/ex14.mu
+#   2. Build a second disk image data.img containing some Unicode text.
+#       dd if=/dev/zero of=data.img count=20160
+#       echo 'நட' |dd of=data.img conv=notrunc
+#   3. Run:
+#       qemu-system-i386 -hda code.img -hdb data.img
+#
+# Expected output: 'நட' in green near the top-left corner of screen
+#
+# Limitations:
+#   - Utf-8 is the one true encoding.
+#   - No keyboard support yet.
+#   - Just single-code-point graphemes so far. No combiner characters, etc.
+
+fn main screen: (addr screen), keyboard: (addr keyboard), data-disk: (addr disk) {
+  var text-storage: (stream byte 0x200)
+  var text/esi: (addr stream byte) <- address text-storage
+  load-sectors data-disk, 0/lba, 1/num-sectors, text
+  var dummy/eax: int <- draw-stream-rightward screen, text, 1/x 0x80/xmax 0/y, 0xa/fg, 0/bg
+}
-- 
cgit 1.4.1-2-gfad0