about summary refs log blame commit diff stats
path: root/403code-point.mu
blob: 3b2b7205eb0b1dbe72ea93e175aa1e2751d23370 (plain) (tree)

















































































































































































                                                                                     
# Helpers for Unicode "code points".
# https://en.wikipedia.org/wiki/Code_point
#
# Mu has no characters, only code points and graphemes.
# Code points are the indivisible atoms of text streams.
# Graphemes are the smallest self-contained units of text.
# Graphemes may consist of multiple code points.
#
# Mu graphemes are always represented in utf-8, and they are required to fit
# in 4 bytes.
#
# Mu doesn't currently support combining code points, or graphemes made of
# multiple code points.

# Convert a code point into its utf-8 encoding, packed into a single grapheme.
#
# Bytes are packed so that the final trailer byte lands in the lowest byte of
# the result and the leading byte in the highest occupied byte.
# For example, code point 0x80 (utf-8: c2 80) yields 0xc280.
#
# Code points above 0x1fffff would need more than 4 bytes. For those we print
# an error to the real screen, set exit-status (ebx) to 1 and call syscall_exit.
#
# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
fn to-grapheme in: code-point -> out/eax: grapheme {
$to-grapheme:body: {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0  # number of '10xxxxxx' continuation bytes
  var first/edx: int <- copy 0         # marker bits for the leading byte
  $to-grapheme:compute-length: {
    # single byte: just return it
    compare c, 0x7f
    {
      break-if->
      out <- copy c
      break $to-grapheme:body
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # more than 4 bytes: unsupported
    # Every supported range has already left this block, so we only get here
    # when c > 0x1fffff. Skip the error only if that somehow isn't the case.
    compare c, 0x1fffff
    {
      break-if-<=
      print-string-to-real-screen "unsupported code point "
      print-int32-hex-to-real-screen c
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # emit trailer bytes, 6 bits from 'in', first two bits '10'
  # The lowest 6 bits go out first, into the lowest byte of 'result'; each
  # later trailer lands one byte higher.
  var byte-shifts/ebx: int <- copy 0
  var result/edi: int <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    tmp <- shift-left-bytes tmp, byte-shifts
    result <- or tmp
    # update loop state
    c <- shift-right 6
    byte-shifts <- increment
    num-trailers <- decrement
    loop
  }
  # emit the leading byte: the remaining high bits of 'c' combined with the
  # length marker in 'first', placed just above the trailers
  var tmp/esi: int <- copy c
  tmp <- or first
  tmp <- shift-left-bytes tmp, byte-shifts
  result <- or tmp
  #
  out <- copy result
}
}

# every single-byte code point (0 through 0x7f) must encode to itself
fn test-to-grapheme-single-byte {
  var i/ecx: int <- copy 0
  {
    # stop once we're past the largest single-byte code point
    compare i, 0x7f
    break-if->
    var cp/eax: code-point <- copy i
    var g/eax: grapheme <- to-grapheme cp
    var actual/eax: int <- copy g
    check-ints-equal actual, i, "F - test-to-grapheme-single-byte"
    i <- increment
    loop
  }
}

# smallest 2-byte utf-8
# code point 0x80 encodes as bytes c2 80
fn test-to-grapheme-two-bytes-min {
  var in/eax: code-point <- copy 0x80       #        10     000000
  var out/eax: grapheme <- to-grapheme in
  var out-int/eax: int <- copy out
  # failure message names this test so it can be told apart from its siblings
  check-ints-equal out-int, 0xc280, "F - test-to-grapheme-two-bytes-min" # 110 00010  10 000000
}

# largest 2-byte utf-8
# code point 0x7ff encodes as bytes df bf
fn test-to-grapheme-two-bytes-max {
  var in/eax: code-point <- copy 0x7ff      #     11111     111111
  var out/eax: grapheme <- to-grapheme in
  var out-int/eax: int <- copy out
  # failure message names this test so it can be told apart from its siblings
  check-ints-equal out-int, 0xdfbf, "F - test-to-grapheme-two-bytes-max" # 110 11111  10 111111
}

# smallest 3-byte utf-8
# code point 0x800 encodes as bytes e0 a0 80
fn test-to-grapheme-three-bytes-min {
  var in/eax: code-point <- copy 0x800      #               100000     000000
  var out/eax: grapheme <- to-grapheme in
  var out-int/eax: int <- copy out
  # leading byte ends up in the highest occupied byte, final trailer in the lowest
  check-ints-equal out-int, 0xe0a080, "F - test-to-grapheme-three-bytes-min" # 1110 0000  10 100000  10 000000
}

# Shift 'n' left by 'k' whole bytes (8 bits per byte).
# At most 4 shifts are performed; by then nothing of 'n' remains in 32 bits.
# needed because available primitives only shift by a literal/constant number of bits
fn shift-left-bytes n: int, k: int -> result/esi: int {
  result <- copy n
  var done/eax: int <- copy 0  # byte-shifts performed so far
  {
    # only 4 bytes in 32 bits
    compare done, 4
    break-if->=
    # stop once we've shifted by the requested number of bytes
    compare done, k
    break-if->=
    result <- shift-left 8
    done <- increment
    loop
  }
}

# shifting by 0 bytes leaves the value untouched
fn test-shift-left-bytes-0 {
  var got/esi: int <- shift-left-bytes 1, 0
  check-ints-equal got, 1, "F - shift-left-bytes 0"
}

# shifting by 1 byte moves the value up 8 bits
fn test-shift-left-bytes-1 {
  var got/esi: int <- shift-left-bytes 1, 1
  check-ints-equal got, 0x100, "F - shift-left-bytes 1"
}

# shifting by 2 bytes moves the value up 16 bits
fn test-shift-left-bytes-2 {
  var got/esi: int <- shift-left-bytes 1, 2
  check-ints-equal got, 0x10000, "F - shift-left-bytes 2"
}

# shifting by 3 bytes moves the value up 24 bits
fn test-shift-left-bytes-3 {
  var got/esi: int <- shift-left-bytes 1, 3
  check-ints-equal got, 0x1000000, "F - shift-left-bytes 3"
}

# shifting by 4 bytes pushes every bit out of a 32-bit value
fn test-shift-left-bytes-4 {
  var got/esi: int <- shift-left-bytes 1, 4
  check-ints-equal got, 0, "F - shift-left-bytes 4"
}

# shift counts beyond 4 bytes behave the same as 4: the result is 0
fn test-shift-left-bytes-5 {
  var got/esi: int <- shift-left-bytes 1, 5
  check-ints-equal got, 0, "F - shift-left-bytes >4"
}

#? fn main {
#?   run-tests
#? }