From 0436ab71eab8768d643d9c8568bdfef1ecc7079b Mon Sep 17 00:00:00 2001
From: "Kartik K. Agaram"
Date: Tue, 22 Jun 2021 21:43:44 -0700
Subject: clean up lexical categories

---
NOTE(review): this copy of the patch arrived with its line breaks collapsed.
Line boundaries below were restored so that every hunk matches the old/new
line counts in its @@ header and the totals in the diffstat. Leading
indentation inside each line could not be recovered and is assumed to be two
spaces per nesting level; confirm against the repository, or apply with
`git apply --ignore-whitespace`.
NOTE(review): the header comment added to shell/tokenize.mu lists four
grapheme categories, while symbol-grapheme? in the same hunk also rejects
brackets, 0x23/hash and 0x22/double-quote; confirm whether the comment
should mention them.

 mu-init.subx      |  10 ++++++
 shell/infix.mu    | 104 ++++--------------------------------------------------
 shell/tokenize.mu |  86 ++++++++++++++++++++++----------------------
 3 files changed, 58 insertions(+), 142 deletions(-)

diff --git a/mu-init.subx b/mu-init.subx
index 26accb8a..ddae2bae 100644
--- a/mu-init.subx
+++ b/mu-init.subx
@@ -15,6 +15,16 @@ Entry:
   #
 #?   (main 0 0 Primary-bus-secondary-drive)
   (set-cursor-position 0 0x30 2)
+  (test-tokenize-number)
+  (test-tokenize-negative-number)
+  (test-tokenize-quote)
+  (test-tokenize-backquote)
+  (test-tokenize-unquote)
+  (test-tokenize-unquote-splice)
+  (test-tokenize-dotted-list)
+  (test-tokenize-stream-literal)
+  (test-tokenize-stream-literal-in-tree)
+  (test-tokenize-indent)
   (test-infix)
 #?   # always first run tests
 #?   (run-tests)
diff --git a/shell/infix.mu b/shell/infix.mu
index f1d9d5d6..a0e81ff2 100644
--- a/shell/infix.mu
+++ b/shell/infix.mu
@@ -1,6 +1,7 @@
 fn transform-infix x-ah: (addr handle cell), trace: (addr trace) {
   trace-text trace, "infix", "transform infix"
   trace-lower trace
+#?   trace-text trace, "infix", "todo"
 #?   draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "a:", 2/fg 0/bg
 #?   dump-cell-from-cursor-over-full-screen x-ah, 7/fg 0/bg
   transform-infix-2 x-ah, trace
@@ -316,91 +317,18 @@ fn operator-symbol? _x: (addr cell) -> _/eax: boolean {
 }
 
 fn non-operator-grapheme? g: grapheme -> _/eax: boolean {
-  ## whitespace
-  compare g, 9/tab
+  var operator?/eax: boolean <- operator-grapheme? g
+  compare operator?, 0/false
   {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0xa/newline
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x20/space
-  {
-    break-if-!=
-    return 0/false
-  }
-  ## we don't really use double quotes
-  compare g, 0x22/double-quote
-  {
-    break-if-!=
-    return 1/true
-  }
-  ## brackets
-  compare g, 0x28/open-paren
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x29/close-paren
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x5b/open-square-bracket
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x5d/close-square-bracket
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x7b/open-curly-bracket
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x7d/close-curly-bracket
-  {
-    break-if-!=
-    return 0/false
-  }
-  # quotes and unquotes are like symbols for this purpose
-  compare g, 0x27/single-quote
-  {
-    break-if-!=
-    return 1/true
-  }
-  compare g, 0x60/backquote
-  {
-    break-if-!=
-    return 1/true
-  }
-  compare g, 0x2c/comma
-  {
-    break-if-!=
-    return 1/true
-  }
-  compare g, 0x40/at-sign
-  {
-    break-if-!=
-    return 1/true
-  }
-  # - other punctuation
-  compare g, 0x23/hash
-  {
-    break-if-!=
+    break-if-=
     return 0/false
   }
   return 1/true
 }
 
+# just a short list of operator graphemes for now
 fn operator-grapheme? g: grapheme -> _/eax: boolean {
-  # '$' is a symbol char
+  # '$' is special and can be in either a symbol or operator
   compare g, 0x25/percent
   {
     break-if-!=
@@ -411,26 +339,6 @@ fn operator-grapheme? g: grapheme -> _/eax: boolean {
     break-if-!=
     return 1/true
   }
-  compare g, 0x27/single-quote
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x60/backquote
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x2c/comma
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x40/at-sign
-  {
-    break-if-!=
-    return 0/false
-  }
   compare g, 0x2a/asterisk
   {
     break-if-!=
diff --git a/shell/tokenize.mu b/shell/tokenize.mu
index b75e57e1..3a080135 100644
--- a/shell/tokenize.mu
+++ b/shell/tokenize.mu
@@ -744,93 +744,91 @@ fn next-indent-token in: (addr gap-buffer), _out: (addr token), trace: (addr tra
   }
 }
 
+# Mu carves up the space of graphemes into 4 categories:
+#   whitespace
+#   quotes and unquotes (from a Lisp perspective; doesn't include double
+#                        quotes or other Unicode quotes)
+#   operators
+#   symbols
+# (Numbers have their own parsing rules that don't fit cleanly in this
+# partition.)
+#
+# During tokenization operators and symbols are treated identically.
+# A later phase digs into that nuance.
+
 fn symbol-grapheme? g: grapheme -> _/eax: boolean {
-  ## whitespace
-  compare g, 9/tab
-  {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0xa/newline
+  var whitespace?/eax: boolean <- whitespace-grapheme? g
+  compare whitespace?, 0/false
   {
-    break-if-!=
-    return 0/false
-  }
-  compare g, 0x20/space
-  {
-    break-if-!=
-    return 0/false
-  }
-  ## quotes
-  compare g, 0x22/double-quote
-  {
-    break-if-!=
+    break-if-=
     return 0/false
   }
-  compare g, 0x60/backquote
+  var quote-or-unquote?/eax: boolean <- quote-or-unquote-grapheme? g
+  compare quote-or-unquote?, 0/false
   {
-    break-if-!=
+    break-if-=
     return 0/false
   }
-  ## brackets
-  compare g, 0x28/open-paren
+  var bracket?/eax: boolean <- bracket-grapheme? g
+  compare bracket?, 0/false
   {
-    break-if-!=
+    break-if-=
     return 0/false
   }
-  compare g, 0x29/close-paren
+  compare g, 0x23/hash  # comments get filtered out
   {
     break-if-!=
     return 0/false
   }
-  compare g, 0x5b/open-square-bracket
+  compare g, 0x22/double-quote  # double quotes reserved for now
   {
     break-if-!=
     return 0/false
   }
-  compare g, 0x5d/close-square-bracket
+  return 1/true
+}
+
+fn whitespace-grapheme? g: grapheme -> _/eax: boolean {
+  compare g, 9/tab
   {
     break-if-!=
-    return 0/false
+    return 1/true
   }
-  compare g, 0x7b/open-curly-bracket
+  compare g, 0xa/newline
   {
     break-if-!=
-    return 0/false
+    return 1/true
   }
-  compare g, 0x7d/close-curly-bracket
+  compare g, 0x20/space
   {
     break-if-!=
-    return 0/false
+    return 1/true
   }
-  # quotes and unquotes
+  return 0/false
+}
+
+fn quote-or-unquote-grapheme? g: grapheme -> _/eax: boolean {
   compare g, 0x27/single-quote
   {
     break-if-!=
-    return 0/false
+    return 1/true
   }
   compare g, 0x60/backquote
   {
     break-if-!=
-    return 0/false
+    return 1/true
   }
   compare g, 0x2c/comma
   {
     break-if-!=
-    return 0/false
+    return 1/true
   }
   compare g, 0x40/at-sign
   {
     break-if-!=
-    return 0/false
-  }
-  # - other punctuation
-  compare g, 0x23/hash
-  {
-    break-if-!=
-    return 0/false
+    return 1/true
   }
-  return 1/true
+  return 0/false
 }
 
 fn bracket-grapheme? g: grapheme -> _/eax: boolean {
-- 
cgit 1.4.1-2-gfad0