about summary refs log blame commit diff stats
path: root/shell/tokenize.mu
blob: b6ca1ffd873d65dfd014b176e160885f334134c8 (plain) (tree)
1
2
3
4
5
6
7
8
9
10



                                                                           





                                                                                 



                                                      



                                                                      






                                                
                                                        
















































                                                                                  







                                                           














































                                                                                          




































                                                                                            









































































                                                                                          





















































                                      




                              














                          
                                                    



                  
                        
































































































                                                       
 





































































































                                                        








                                                                    





































                                                                    
# We reuse the cell data structure for tokenization
# Token cells are special, though. They have no type, they're always atoms,
# they always have text-data.

# Split the contents of 'in' into tokens, appending one token cell to 'out'
# per token.
# If 'trace' reports an error after reading a token, return immediately: the
# failing token is not appended (earlier ones stay in 'out'), and the
# trace-higher at the bottom is not reached.
fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) {
  trace-text trace, "read", "tokenize"
  trace-lower trace
  rewind-gap-buffer in
  var token-storage: cell
  var token/edx: (addr cell) <- address token-storage
  {
    # Skip whitespace before testing for end of input. Otherwise trailing
    # whitespace sends next-token off with nothing left to read, and a token
    # with no text gets appended to 'out'.
    skip-whitespace-from-gap-buffer in
    var done?/eax: boolean <- gap-buffer-scan-done? in
    compare done?, 0/false
    break-if-!=
    # initialize token data each iteration to avoid aliasing
    var dest-ah/eax: (addr handle stream byte) <- get token, text-data
    populate-stream dest-ah, 0x40/max-token-size
    #
    next-token in, token, trace
    var error?/eax: boolean <- has-errors? trace
    compare error?, 0/false
    {
      break-if-=
      return
    }
    write-to-stream out, token  # shallow-copy text-data
    loop
  }
  trace-higher trace
}

# Read a single token from 'in' into the text-data stream of '_out-cell'.
# The stream is cleared and whitespace is skipped first; the next grapheme
# then decides what gets read:
#   decimal digit   -> next-number-token
#   symbol grapheme -> next-symbol-token
#   bracket         -> that one grapheme, via next-bracket-token
#   operator        -> next-operator-token
# A grapheme that fits none of these categories is reported as an error on
# 'trace'. Nothing here would consume it, so silently returning an empty token
# would leave a caller looping over the input (like tokenize) stuck on it.
fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) {
  trace-text trace, "read", "next-token"
  trace-lower trace
  var out-cell/eax: (addr cell) <- copy _out-cell
  var out-ah/eax: (addr handle stream byte) <- get out-cell, text-data
  var _out/eax: (addr stream byte) <- lookup *out-ah
  # move to edi so that eax is free for temporaries below
  var out/edi: (addr stream byte) <- copy _out
  $next-token:body: {
    clear-stream out
    skip-whitespace-from-gap-buffer in
    var g/eax: grapheme <- peek-from-gap-buffer in
    # trace the grapheme we're dispatching on
    {
      var stream-storage: (stream byte 0x40)
      var stream/esi: (addr stream byte) <- address stream-storage
      write stream, "next: "
      var gval/eax: int <- copy g
      write-int32-hex stream, gval
      trace trace, "read", stream
    }
    # digit
    {
      var digit?/eax: boolean <- is-decimal-digit? g
      compare digit?, 0/false
      break-if-=
      next-number-token in, out, trace
      break $next-token:body
    }
    # other symbol char
    {
      var symbol?/eax: boolean <- is-symbol-grapheme? g
      compare symbol?, 0/false
      break-if-=
      next-symbol-token in, out, trace
      break $next-token:body
    }
    # brackets are always single-char tokens
    {
      var bracket?/eax: boolean <- is-bracket-grapheme? g
      compare bracket?, 0/false
      break-if-=
      var g/eax: grapheme <- read-from-gap-buffer in
      next-bracket-token g, out, trace
      break $next-token:body
    }
    # non-symbol operators
    {
      var operator?/eax: boolean <- is-operator-grapheme? g
      compare operator?, 0/false
      break-if-=
      next-operator-token in, out, trace
      break $next-token:body
    }
    # no category matched (e.g. double-quote, backquote or hash)
    error trace, "unrecognized token"
  }
  trace-higher trace
  # trace the text of the token we ended up with
  var stream-storage: (stream byte 0x40)
  var stream/eax: (addr stream byte) <- address stream-storage
  write stream, "=> "
  rewind-stream out
  write-stream stream, out
  trace trace, "read", stream
}

# Append graphemes from 'in' to 'out' for as long as is-symbol-grapheme?
# accepts them. Scanning ends at end of input or at the first grapheme that is
# rejected; read-from-gap-buffer is never called for that grapheme.
# Every grapheme examined and the final contents of 'out' are traced.
fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a symbol"
  trace-lower trace
  $next-symbol-token:loop: {
    # out of input?
    var at-end?/eax: boolean <- gap-buffer-scan-done? in
    compare at-end?, 0/false
    break-if-!=
    # grapheme under consideration
    var candidate/eax: grapheme <- peek-from-gap-buffer in
    # trace its numeric value in hex
    {
      var line-storage: (stream byte 0x40)
      var line/esi: (addr stream byte) <- address line-storage
      write line, "next: "
      var as-int/eax: int <- copy candidate
      write-int32-hex line, as-int
      trace trace, "read", line
    }
    # the token ends at the first grapheme that can't be part of a symbol
    {
      var part-of-symbol?/eax: boolean <- is-symbol-grapheme? candidate
      compare part-of-symbol?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-symbol-token:loop
    }
    # accept: read the grapheme and append it to the token
    var accepted/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, accepted
    loop
  }
  trace-higher trace
  # trace the token text
  var summary-storage: (stream byte 0x40)
  var summary/esi: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}

# Append graphemes from 'in' to 'out' for as long as is-operator-grapheme?
# accepts them. Scanning ends at end of input or at the first grapheme that is
# rejected; read-from-gap-buffer is never called for that grapheme.
# Every grapheme examined and the final contents of 'out' are traced.
fn next-operator-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a operator"
  trace-lower trace
  $next-operator-token:loop: {
    # out of input?
    var at-end?/eax: boolean <- gap-buffer-scan-done? in
    compare at-end?, 0/false
    break-if-!=
    # grapheme under consideration
    var candidate/eax: grapheme <- peek-from-gap-buffer in
    # trace its numeric value in hex
    {
      var line-storage: (stream byte 0x40)
      var line/esi: (addr stream byte) <- address line-storage
      write line, "next: "
      var as-int/eax: int <- copy candidate
      write-int32-hex line, as-int
      trace trace, "read", line
    }
    # the token ends at the first grapheme that isn't an operator
    {
      var part-of-operator?/eax: boolean <- is-operator-grapheme? candidate
      compare part-of-operator?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-operator-token:loop
    }
    # accept: read the grapheme and append it to the token
    var accepted/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, accepted
    loop
  }
  trace-higher trace
  # trace the token text
  var summary-storage: (stream byte 0x40)
  var summary/esi: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}

# Append a run of decimal digits from 'in' to 'out'.
# The run ends at end of input or at the first grapheme that
# is-symbol-grapheme? rejects. A grapheme that is accepted as a symbol
# grapheme but is not a decimal digit is an error ("invalid number"); the
# function then returns on the spot, without reaching the trace-higher at the
# bottom.
fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a number"
  trace-lower trace
  $next-number-token:loop: {
    # out of input?
    var at-end?/eax: boolean <- gap-buffer-scan-done? in
    compare at-end?, 0/false
    break-if-!=
    # grapheme under consideration
    var candidate/eax: grapheme <- peek-from-gap-buffer in
    # trace its numeric value in hex
    {
      var line-storage: (stream byte 0x40)
      var line/esi: (addr stream byte) <- address line-storage
      write line, "next: "
      var as-int/eax: int <- copy candidate
      write-int32-hex line, as-int
      trace trace, "read", line
    }
    # a non-symbol grapheme ends the number
    {
      var symbol-like?/eax: boolean <- is-symbol-grapheme? candidate
      compare symbol-like?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-number-token:loop
    }
    # a symbol grapheme other than a digit makes the number malformed
    {
      var decimal?/eax: boolean <- is-decimal-digit? candidate
      compare decimal?, 0/false
      break-if-!=
      error trace, "invalid number"
      return
    }
    # accept: read the digit and append it to the token
    trace-text trace, "read", "append"
    var accepted/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, accepted
    loop
  }
  trace-higher trace
}

# Emit the single grapheme 'g' as a token: append it to 'out', then trace the
# resulting contents of 'out'. The caller passes 'g' in; nothing is read from
# any input here.
fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) {
  # scratch space for the trace line
  var summary-storage: (stream byte 0x40)
  var summary/esi: (addr stream byte) <- address summary-storage
  trace-text trace, "read", "bracket"
  write-grapheme out, g
  # trace the token text
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}

# Can 'g' be part of a symbol?
# Implemented as a deny-list: whitespace, quotes, brackets and most ASCII
# punctuation return false; every other grapheme returns true.
# The punctuation that is NOT denied (and so counts as symbol text):
# '!', '$', '?' and '_'.
fn is-symbol-grapheme? g: grapheme -> _/eax: boolean {
  ## whitespace
  {
    compare g, 9/tab
    break-if-!=
    return 0/false
  }
  {
    compare g, 0xa/newline
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x20/space
    break-if-!=
    return 0/false
  }
  ## quotes
  {
    compare g, 0x22/double-quote
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x60/backquote
    break-if-!=
    return 0/false
  }
  ## brackets
  {
    compare g, 0x28/open-paren
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x29/close-paren
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5b/open-square-bracket
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5d/close-square-bracket
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x7b/open-curly-bracket
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x7d/close-curly-bracket
    break-if-!=
    return 0/false
  }
  ## other punctuation, in ASCII order
  # (0x21 '!' is allowed in symbols)
  {
    compare g, 0x23/hash
    break-if-!=
    return 0/false
  }
  # (0x24 '$' is allowed in symbols)
  {
    compare g, 0x25/percent
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x26/ampersand
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x27/single-quote
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2a/asterisk
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2b/plus
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2c/comma
    break-if-!=
    return 0/false
  }
  # no dashes inside symbols
  {
    compare g, 0x2d/dash
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2e/period
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2f/slash
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3a/colon
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3b/semi-colon
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3c/less-than
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3d/equal
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3e/greater-than
    break-if-!=
    return 0/false
  }
  # (0x3f '?' is allowed in symbols)
  {
    compare g, 0x40/at-sign
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5c/backslash
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5e/caret
    break-if-!=
    return 0/false
  }
  # (0x5f '_' is allowed in symbols)
  {
    compare g, 0x7c/vertical-line
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x7e/tilde
    break-if-!=
    return 0/false
  }
  # not on the deny-list
  return 1/true
}

# Is 'g' one of the six ASCII brackets: ( ) [ ] { } ?
fn is-bracket-grapheme? g: grapheme -> _/eax: boolean {
  # round
  {
    compare g, 0x28/open-paren
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x29/close-paren
    break-if-!=
    return 1/true
  }
  # square
  {
    compare g, 0x5b/open-square-bracket
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x5d/close-square-bracket
    break-if-!=
    return 1/true
  }
  # curly
  {
    compare g, 0x7b/open-curly-bracket
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x7d/close-curly-bracket
    break-if-!=
    return 1/true
  }
  # anything else
  return 0/false
}

# Is 'g' an operator grapheme, i.e. one of the punctuation characters that
# next-token hands to next-operator-token?
# Implemented as an allow-list in ASCII order; anything not listed returns
# false.
# NOTE(review): 0x23 '#' is rejected by is-symbol-grapheme? but is not listed
# here either, so it belongs to no category -- confirm that's intended.
fn is-operator-grapheme? g: grapheme -> _/eax: boolean {
  # '$' is a symbol char
  compare g, 0x25/percent
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x26/ampersand
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x27/single-quote
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2a/asterisk
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2b/plus
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2c/comma
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2d/dash  # '-' not allowed in symbols
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2e/period
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2f/slash
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3a/colon
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3b/semi-colon
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3c/less-than
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3d/equal
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3e/greater-than
  {
    break-if-!=
    return 1/true
  }
  # '?' is a symbol char
  compare g, 0x40/at-sign
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x5c/backslash
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x5e/caret
  {
    break-if-!=
    return 1/true
  }
  # '_' is a symbol char
  compare g, 0x7c/vertical-line
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x7e/tilde
  {
    break-if-!=
    return 1/true
  }
  return 0/false
}

# Does the token in '_in' start with a decimal digit?
# Rewinds the token's text-data stream and reads its first grapheme, so the
# stream's read position changes as a side effect.
fn is-number-token? _in: (addr cell) -> _/eax: boolean {
  var token/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var starts-with-digit?/eax: boolean <- is-decimal-digit? first
  return starts-with-digit?
}

# Does the token in '_in' start with a bracket (see is-bracket-grapheme?)?
# Rewinds the token's text-data stream and reads its first grapheme, so the
# stream's read position changes as a side effect.
fn is-bracket-token? _in: (addr cell) -> _/eax: boolean {
  var token/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var starts-with-bracket?/eax: boolean <- is-bracket-grapheme? first
  return starts-with-bracket?
}

# Does the token in '_in' start with '('?
# Rewinds the token's text-data stream and reads its first grapheme, so the
# stream's read position changes as a side effect.
fn is-open-paren-token? _in: (addr cell) -> _/eax: boolean {
  var token/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x28/open-paren
    break-if-!=
    return 1/true
  }
  return 0/false
}

# Does the token in '_in' start with ')'?
# Rewinds the token's text-data stream and reads its first grapheme, so the
# stream's read position changes as a side effect.
fn is-close-paren-token? _in: (addr cell) -> _/eax: boolean {
  var in/eax: (addr cell) <- copy _in
  var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
  var in-data/eax: (addr stream byte) <- lookup *in-data-ah
  rewind-stream in-data
  var g/eax: grapheme <- read-grapheme in-data
  # 0x29 is ')'
  compare g, 0x29/close-paren
  {
    break-if-!=
    return 1/true
  }
  return 0/false
}