# We reuse the cell data structure for tokenization
# Token cells are special, though. They have no type, they're always atoms,
# they always have text-data.
fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) {
trace-text trace, "read", "tokenize"
trace-lower trace
rewind-gap-buffer in
var token-storage: cell
var token/edx: (addr cell) <- address token-storage
{
var done?/eax: boolean <- gap-buffer-scan-done? in
compare done?, 0/false
break-if-!=
# initialize token data each iteration to avoid aliasing
var dest-ah/eax: (addr handle stream byte) <- get token, text-data
populate-stream dest-ah, 0x40/max-token-size
#
next-token in, token, trace
var error?/eax: boolean <- has-errors? trace
compare error?, 0/false
{
break-if-=
return
}
write-to-stream out, token # shallow-copy text-data
loop
}
trace-higher trace
}
fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) {
trace-text trace, "read", "next-token"
trace-lower trace
var out-cell/eax: (addr cell) <- copy _out-cell
var out-ah/eax: (addr handle stream byte) <- get out-cell, text-data
var _out/eax: (addr stream byte) <- lookup *out-ah
var out/edi: (addr stream byte) <- copy _out
$next-token:body: {
clear-stream out
skip-whitespace-from-gap-buffer in
var g/eax: grapheme <- peek-from-gap-buffer in
{
var stream-storage: (stream byte 0x40)
var stream/esi: (addr stream byte) <- address stream-storage
write stream, "next: "
var gval/eax: int <- copy g
write-int32-hex stream, gval
trace trace, "read", stream
}
# digit
{
var digit?/eax: boolean <- is-decimal-digit? g
compare digit?, 0/false
break-if-=
next-number-token in, out, trace
break $next-token:body
}
# other symbol char
{
var symbol?/eax: boolean <- is-symbol-grapheme? g
compare symbol?, 0/false
break-if-=
next-symbol-token in, out, trace
break $next-token:body
}
# brackets are always single-char tokens
{
var bracket?/eax: boolean <- is-bracket-grapheme? g
compare bracket?, 0/false
break-if-=
var g/eax: grapheme <- read-from-gap-buffer in
next-bracket-token g, out, trace
break $next-token:body
}
}
trace-higher trace
var stream-storage: (stream byte 0x40)
var stream/eax: (addr stream byte) <- address stream-storage
write stream, "=> "
rewind-stream out
write-stream stream, out
trace trace, "read", stream
}
fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
trace-text trace, "read", "looking for a symbol"
trace-lower trace
$next-symbol-token:loop: {
var done?/eax: boolean <- gap-buffer-scan-done? in
compare done?, 0/false
break-if-!=
var g/eax: grapheme <- peek-from-gap-buffer in
{
var stream-storage: (stream byte 0x40)
var stream/esi: (addr stream byte) <- address stream-storage
write stream, "next: "
var gval/eax: int <- copy g
write-int32-hex stream, gval
trace trace, "read", stream
}
# if non-symbol, return
{
var symbol-grapheme?/eax: boolean <- is-symbol-grapheme? g
compare symbol-grapheme?, 0/false
break-if-!=
trace-text trace, "read", "stop"
break $next-symbol-token:loop
}
var g/eax: grapheme <- read-from-gap-buffer in
write-grapheme out, g
loop
}
trace-higher trace
var stream-storage: (stream byte 0x40)
var stream/esi: (addr stream byte) <- address stream-storage
write stream, "=> "
rewind-stream out
write-stream stream, out
trace trace, "read", stream
}
fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
trace-text trace, "read", "looking for a number"
trace-lower trace
$next-number-token:loop: {
var done?/eax: boolean <- gap-buffer-scan-done? in
compare done?, 0/false
break-if-!=
var g/eax: grapheme <- peek-from-gap-buffer in
{
var stream-storage: (stream byte 0x40)
var stream/esi: (addr stream byte) <- address stream-storage
write stream, "next: "
var gval/eax: int <- copy g
write-int32-hex stream, gval
trace trace, "read", stream
}
# if not symbol grapheme, return
{
var symbol-grapheme?/eax: boolean <- is-symbol-grapheme? g
compare symbol-grapheme?, 0/false
break-if-!=
trace-text trace, "read", "stop"
break $next-number-token:loop
}
# if not digit grapheme, abort
{
var digit?/eax: boolean <- is-decimal-digit? g
compare digit?, 0/false
break-if-!=
error trace, "invalid number"
return
}
trace-text trace, "read", "append"
var g/eax: grapheme <- read-from-gap-buffer in
write-grapheme out, g
loop
}
trace-higher trace
}
fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) {
trace-text trace, "read", "bracket"
write-grapheme out, g
var stream-storage: (stream byte 0x40)
var stream/esi: (addr stream byte) <- address stream-storage
write stream, "=> "
rewind-stream out
write-stream stream, out
trace trace, "read", stream
}
fn is-symbol-grapheme? g: grapheme -> _/eax: boolean {
## whitespace
compare g, 9/tab
{
break-if-!=
return 0/false
}
compare g, 0xa/newline
{
break-if-!=
return 0/false
}
compare g, 0x20/space
{
break-if-!=
return 0/false
}
## quotes
compare g, 0x22/double-quote
{
break-if-!=
return 0/false
}
compare g, 0x27/single-quote
{
break-if-!=
return 0/false
}
compare g, 0x60/backquote
{
break-if-!=
return 0/false
}
## brackets
compare g, 0x28/open-paren
{
break-if-!=
return 0/false
}
compare g, 0x29/close-paren
{
break-if-!=
return 0/false
}
compare g, 0x5b/open-square-bracket
{
break-if-!=
return 0/false
}
compare g, 0x5d/close-square-bracket
{
break-if-!=
return 0/false
}
compare g, 0x7b/open-curly-bracket
{
break-if-!=
return 0/false
}
compare g, 0x7d/close-curly-bracket
{
break-if-!=
return 0/false
}
# - other punctuation
# '!' is a symbol char
compare g, 0x23/hash
{
break-if-!=
return 0/false
}
# '$' is a symbol char
compare g, 0x25/percent
{
break-if-!=
return 0/false
}
compare g, 0x26/ampersand
{
break-if-!=
return 0/false
}
compare g, 0x2a/asterisk
{
break-if-!=
return 0/false
}
compare g, 0x2b/plus
{
break-if-!=
return 0/false
}
compare g, 0x2c/comma
{
break-if-!=
return 0/false
}
# '-' is a symbol char
compare g, 0x2e/period
{
break-if-!=
return 0/false
}
compare g, 0x2f/slash
{
break-if-!=
return 0/false
}
compare g, 0x2f/slash
{
break-if-!=
return 0/false
}
compare g, 0x3a/colon
{
break-if-!=
return 0/false
}
compare g, 0x3b/semi-colon
{
break-if-!=
return 0/false
}
compare g, 0x3c/less-than
{
break-if-!=
return 0/false
}
compare g, 0x3d/equal
{
break-if-!=
return 0/false
}
compare g, 0x3e/greater-than
{
break-if-!=
return 0/false
}
# '?' is a symbol char
compare g, 0x40/at-sign
{
break-if-!=
return 0/false
}
compare g, 0x5c/backslash
{
break-if-!=
return 0/false
}
compare g, 0x5e/caret
{
break-if-!=
return 0/false
}
# '_' is a symbol char
compare g, 0x7c/vertical-line
{
break-if-!=
return 0/false
}
compare g, 0x7e/tilde
{
break-if-!=
return 0/false
}
return 1/true
}
fn is-bracket-grapheme? g: grapheme -> _/eax: boolean {
compare g, 0x28/open-paren
{
break-if-!=
return 1/true
}
compare g, 0x29/close-paren
{
break-if-!=
return 1/true
}
compare g, 0x5b/open-square-bracket
{
break-if-!=
return 1/true
}
compare g, 0x5d/close-square-bracket
{
break-if-!=
return 1/true
}
compare g, 0x7b/open-curly-bracket
{
break-if-!=
return 1/true
}
compare g, 0x7d/close-curly-bracket
{
break-if-!=
return 1/true
}
return 0/false
}
fn is-number-token? _in: (addr cell) -> _/eax: boolean {
var in/eax: (addr cell) <- copy _in
var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
var in-data/eax: (addr stream byte) <- lookup *in-data-ah
rewind-stream in-data
var g/eax: grapheme <- read-grapheme in-data
var result/eax: boolean <- is-decimal-digit? g
return result
}