diff options
author | Kartik K. Agaram <vc@akkartik.com> | 2021-03-03 22:09:50 -0800 |
---|---|---|
committer | Kartik K. Agaram <vc@akkartik.com> | 2021-03-03 22:21:03 -0800 |
commit | 71e4f3812982dba2efb471283d310224e8db363e (patch) | |
tree | ea111a1acb8b8845dbda39c0e1b4bac1d198143b /shell/tokenize.mu | |
parent | c6b928be29ac8cdb4e4d6e1eaa20420ff03e5a4c (diff) | |
download | mu-71e4f3812982dba2efb471283d310224e8db363e.tar.gz |
7842 - new directory organization
Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at.
Diffstat (limited to 'shell/tokenize.mu')
-rw-r--r-- | shell/tokenize.mu | 422 |
1 files changed, 422 insertions, 0 deletions
diff --git a/shell/tokenize.mu b/shell/tokenize.mu new file mode 100644 index 00000000..7beedf23 --- /dev/null +++ b/shell/tokenize.mu @@ -0,0 +1,422 @@ +# We reuse the cell data structure for tokenization +# Token cells are special, though. They have no type, they're always atoms, +# they always have text-data. + +fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) { + trace-text trace, "read", "tokenize" + trace-lower trace + rewind-gap-buffer in + var token-storage: cell + var token/edx: (addr cell) <- address token-storage + { + var done?/eax: boolean <- gap-buffer-scan-done? in + compare done?, 0/false + break-if-!= + # initialize token data each iteration to avoid aliasing + var dest-ah/eax: (addr handle stream byte) <- get token, text-data + populate-stream dest-ah, 0x40/max-token-size + # + next-token in, token, trace + var error?/eax: boolean <- has-errors? trace + compare error?, 0/false + { + break-if-= + return + } + write-to-stream out, token # shallow-copy text-data + loop + } + trace-higher trace +} + +fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) { + trace-text trace, "read", "next-token" + trace-lower trace + var out-cell/eax: (addr cell) <- copy _out-cell + var out-ah/eax: (addr handle stream byte) <- get out-cell, text-data + var _out/eax: (addr stream byte) <- lookup *out-ah + var out/edi: (addr stream byte) <- copy _out + $next-token:body: { + clear-stream out + skip-whitespace-from-gap-buffer in + var g/eax: grapheme <- peek-from-gap-buffer in + { + var stream-storage: (stream byte 0x40) + var stream/esi: (addr stream byte) <- address stream-storage + write stream, "next: " + var gval/eax: int <- copy g + write-int32-hex stream, gval + trace trace, "read", stream + } + # digit + { + var digit?/eax: boolean <- is-decimal-digit? g + compare digit?, 0/false + break-if-= + next-number-token in, out, trace + break $next-token:body + } + # other symbol char + { + var symbol?/eax: boolean <- is-symbol-grapheme? g + compare symbol?, 0/false + break-if-= + next-symbol-token in, out, trace + break $next-token:body + } + # brackets are always single-char tokens + { + var bracket?/eax: boolean <- is-bracket-grapheme? g + compare bracket?, 0/false + break-if-= + var g/eax: grapheme <- read-from-gap-buffer in + next-bracket-token g, out, trace + break $next-token:body + } + } + trace-higher trace + var stream-storage: (stream byte 0x40) + var stream/eax: (addr stream byte) <- address stream-storage + write stream, "=> " + rewind-stream out + write-stream stream, out + trace trace, "read", stream +} + +fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) { + trace-text trace, "read", "looking for a symbol" + trace-lower trace + $next-symbol-token:loop: { + var done?/eax: boolean <- gap-buffer-scan-done? in + compare done?, 0/false + break-if-!= + var g/eax: grapheme <- peek-from-gap-buffer in + { + var stream-storage: (stream byte 0x40) + var stream/esi: (addr stream byte) <- address stream-storage + write stream, "next: " + var gval/eax: int <- copy g + write-int32-hex stream, gval + trace trace, "read", stream + } + # if non-symbol, return + { + var symbol-grapheme?/eax: boolean <- is-symbol-grapheme? g + compare symbol-grapheme?, 0/false + break-if-!= + trace-text trace, "read", "stop" + break $next-symbol-token:loop + } + var g/eax: grapheme <- read-from-gap-buffer in + write-grapheme out, g + loop + } + trace-higher trace + var stream-storage: (stream byte 0x40) + var stream/esi: (addr stream byte) <- address stream-storage + write stream, "=> " + rewind-stream out + write-stream stream, out + trace trace, "read", stream +} + +fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) { + trace-text trace, "read", "looking for a number" + trace-lower trace + $next-number-token:loop: { + var done?/eax: boolean <- gap-buffer-scan-done? in + compare done?, 0/false + break-if-!= + var g/eax: grapheme <- peek-from-gap-buffer in + { + var stream-storage: (stream byte 0x40) + var stream/esi: (addr stream byte) <- address stream-storage + write stream, "next: " + var gval/eax: int <- copy g + write-int32-hex stream, gval + trace trace, "read", stream + } + # if not symbol grapheme, return + { + var symbol-grapheme?/eax: boolean <- is-symbol-grapheme? g + compare symbol-grapheme?, 0/false + break-if-!= + trace-text trace, "read", "stop" + break $next-number-token:loop + } + # if not digit grapheme, abort + { + var digit?/eax: boolean <- is-decimal-digit? g + compare digit?, 0/false + break-if-!= + error trace, "invalid number" + return + } + trace-text trace, "read", "append" + var g/eax: grapheme <- read-from-gap-buffer in + write-grapheme out, g + loop + } + trace-higher trace +} + +fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) { + trace-text trace, "read", "bracket" + write-grapheme out, g + var stream-storage: (stream byte 0x40) + var stream/esi: (addr stream byte) <- address stream-storage + write stream, "=> " + rewind-stream out + write-stream stream, out + trace trace, "read", stream +} + +fn is-symbol-grapheme? g: grapheme -> _/eax: boolean { + ## whitespace + compare g, 9/tab + { + break-if-!= + return 0/false + } + compare g, 0xa/newline + { + break-if-!= + return 0/false + } + compare g, 0x20/space + { + break-if-!= + return 0/false + } + ## quotes + compare g, 0x22/double-quote + { + break-if-!= + return 0/false + } + compare g, 0x27/single-quote + { + break-if-!= + return 0/false + } + compare g, 0x60/backquote + { + break-if-!= + return 0/false + } + ## brackets + compare g, 0x28/open-paren + { + break-if-!= + return 0/false + } + compare g, 0x29/close-paren + { + break-if-!= + return 0/false + } + compare g, 0x5b/open-square-bracket + { + break-if-!= + return 0/false + } + compare g, 0x5d/close-square-bracket + { + break-if-!= + return 0/false + } + compare g, 0x7b/open-curly-bracket + { + break-if-!= + return 0/false + } + compare g, 0x7d/close-curly-bracket + { + break-if-!= + return 0/false + } + # - other punctuation + # '!' is a symbol char + compare g, 0x23/hash + { + break-if-!= + return 0/false + } + # '$' is a symbol char + compare g, 0x25/percent + { + break-if-!= + return 0/false + } + compare g, 0x26/ampersand + { + break-if-!= + return 0/false + } + compare g, 0x2a/asterisk + { + break-if-!= + return 0/false + } + compare g, 0x2b/plus + { + break-if-!= + return 0/false + } + compare g, 0x2c/comma + { + break-if-!= + return 0/false + } + # '-' is a symbol char + compare g, 0x2e/period + { + break-if-!= + return 0/false + } + compare g, 0x2f/slash + { + break-if-!= + return 0/false + } + compare g, 0x2f/slash + { + break-if-!= + return 0/false + } + compare g, 0x3a/colon + { + break-if-!= + return 0/false + } + compare g, 0x3b/semi-colon + { + break-if-!= + return 0/false + } + compare g, 0x3c/less-than + { + break-if-!= + return 0/false + } + compare g, 0x3d/equal + { + break-if-!= + return 0/false + } + compare g, 0x3e/greater-than + { + break-if-!= + return 0/false + } + # '?' is a symbol char + compare g, 0x40/at-sign + { + break-if-!= + return 0/false + } + compare g, 0x5c/backslash + { + break-if-!= + return 0/false + } + compare g, 0x5e/caret + { + break-if-!= + return 0/false + } + # '_' is a symbol char + compare g, 0x7c/vertical-line + { + break-if-!= + return 0/false + } + compare g, 0x7e/tilde + { + break-if-!= + return 0/false + } + return 1/true +} + +fn is-bracket-grapheme? g: grapheme -> _/eax: boolean { + compare g, 0x28/open-paren + { + break-if-!= + return 1/true + } + compare g, 0x29/close-paren + { + break-if-!= + return 1/true + } + compare g, 0x5b/open-square-bracket + { + break-if-!= + return 1/true + } + compare g, 0x5d/close-square-bracket + { + break-if-!= + return 1/true + } + compare g, 0x7b/open-curly-bracket + { + break-if-!= + return 1/true + } + compare g, 0x7d/close-curly-bracket + { + break-if-!= + return 1/true + } + return 0/false +} + +fn is-number-token? _in: (addr cell) -> _/eax: boolean { + var in/eax: (addr cell) <- copy _in + var in-data-ah/eax: (addr handle stream byte) <- get in, text-data + var in-data/eax: (addr stream byte) <- lookup *in-data-ah + rewind-stream in-data + var g/eax: grapheme <- read-grapheme in-data + var result/eax: boolean <- is-decimal-digit? g + return result +} + +fn is-bracket-token? _in: (addr cell) -> _/eax: boolean { + var in/eax: (addr cell) <- copy _in + var in-data-ah/eax: (addr handle stream byte) <- get in, text-data + var in-data/eax: (addr stream byte) <- lookup *in-data-ah + rewind-stream in-data + var g/eax: grapheme <- read-grapheme in-data + var result/eax: boolean <- is-bracket-grapheme? g + return result +} + +fn is-open-paren-token? _in: (addr cell) -> _/eax: boolean { + var in/eax: (addr cell) <- copy _in + var in-data-ah/eax: (addr handle stream byte) <- get in, text-data + var in-data/eax: (addr stream byte) <- lookup *in-data-ah + rewind-stream in-data + var g/eax: grapheme <- read-grapheme in-data + compare g, 0x28/open-paren + { + break-if-!= + return 1/true + } + return 0/false +} + +fn is-close-paren-token? _in: (addr cell) -> _/eax: boolean { + var in/eax: (addr cell) <- copy _in + var in-data-ah/eax: (addr handle stream byte) <- get in, text-data + var in-data/eax: (addr stream byte) <- lookup *in-data-ah + rewind-stream in-data + var g/eax: grapheme <- read-grapheme in-data + compare g, 0x29/open-paren + { + break-if-!= + return 1/true + } + return 0/false +} |