https://github.com/akkartik/mu/blob/main/shell/tokenize.mu
# We reuse the cell data structure for tokenization.
# Token cells are special, though. They're always atoms and they always have
# text-data. They have no type, except that stream literals get type 3/stream.
   4 
   5 fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) {
   6   trace-text trace, "tokenize", "tokenize"
   7   trace-lower trace
   8   rewind-gap-buffer in
   9   var token-storage: cell
  10   var token/edx: (addr cell) <- address token-storage
  11   {
  12     skip-whitespace-from-gap-buffer in
  13     var done?/eax: boolean <- gap-buffer-scan-done? in
  14     compare done?, 0/false
  15     break-if-!=
  16     #
  17     next-token in, token, trace
  18     var error?/eax: boolean <- has-errors? trace
  19     compare error?, 0/false
  20     {
  21       break-if-=
  22       return
  23     }
  24     var skip?/eax: boolean <- comment-token? token
  25     compare skip?, 0/false
  26     loop-if-!=
  27     write-to-stream out, token  # shallow-copy text-data
  28     loop
  29   }
  30   trace-higher trace
  31 }
  32 
  33 fn test-tokenize-number {
  34   var in-storage: gap-buffer
  35   var in/esi: (addr gap-buffer) <- address in-storage
  36   initialize-gap-buffer-with in, "123 a"
  37   #
  38   var stream-storage: (stream cell 0x10)
  39   var stream/edi: (addr stream cell) <- address stream-storage
  40   #
  41   var trace-storage: trace
  42   var trace/edx: (addr trace) <- address trace-storage
  43   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
  44   tokenize in, stream, trace
  45   #
  46   var curr-token-storage: cell
  47   var curr-token/ebx: (addr cell) <- address curr-token-storage
  48   read-from-stream stream, curr-token
  49   var number?/eax: boolean <- number-token? curr-token
  50   check number?, "F - test-tokenize-number"
  51   var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
  52   var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
  53   check-stream-equal curr-token-data, "123", "F - test-tokenize-number: value"
  54 }
  55 
  56 fn test-tokenize-negative-number {
  57   var in-storage: gap-buffer
  58   var in/esi: (addr gap-buffer) <- address in-storage
  59   initialize-gap-buffer-with in, "-123 a"
  60   #
  61   var stream-storage: (stream cell 0x10)
  62   var stream/edi: (addr stream cell) <- address stream-storage
  63   #
  64   var trace-storage: trace
  65   var trace/edx: (addr trace) <- address trace-storage
  66   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
  67   tokenize in, stream, trace
  68   #
  69   var curr-token-storage: cell
  70   var curr-token/ebx: (addr cell) <- address curr-token-storage
  71   read-from-stream stream, curr-token
  72   var number?/eax: boolean <- number-token? curr-token
  73   check number?, "F - test-tokenize-negative-number"
  74   var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
  75   var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
  76   check-stream-equal curr-token-data, "-123", "F - test-tokenize-negative-number: value"
  77 }
  78 
  79 fn test-tokenize-number-followed-by-hyphen {
  80   var in-storage: gap-buffer
  81   var in/esi: (addr gap-buffer) <- address in-storage
  82   initialize-gap-buffer-with in, "123-4 a"
  83   #
  84   var stream-storage: (stream cell 0x10)
  85   var stream/edi: (addr stream cell) <- address stream-storage
  86   #
  87   var trace-storage: trace
  88   var trace/edx: (addr trace) <- address trace-storage
  89   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
  90   tokenize in, stream, trace
  91   #
  92   var curr-token-storage: cell
  93   var curr-token/ebx: (addr cell) <- address curr-token-storage
  94   read-from-stream stream, curr-token
  95   var number?/eax: boolean <- number-token? curr-token
  96   check number?, "F - test-tokenize-number-followed-by-hyphen"
  97   var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
  98   var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
  99   check-stream-equal curr-token-data, "123", "F - test-tokenize-number-followed-by-hyphen: value"
 100 }
 101 
 102 fn test-tokenize-quote {
 103   var in-storage: gap-buffer
 104   var in/esi: (addr gap-buffer) <- address in-storage
 105   initialize-gap-buffer-with in, "'(a)"
 106   #
 107   var stream-storage: (stream cell 0x10)
 108   var stream/edi: (addr stream cell) <- address stream-storage
 109   #
 110   var trace-storage: trace
 111   var trace/edx: (addr trace) <- address trace-storage
 112   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
 113   tokenize in, stream, trace
 114   #
 115   var curr-token-storage: cell
 116   var curr-token/ebx: (addr cell) <- address curr-token-storage
 117   read-from-stream stream, curr-token
 118   var quote?/eax: boolean <- quote-token? curr-token
 119   check quote?, "F - test-tokenize-quote: quote"
 120   read-from-stream stream, curr-token
 121   var open-paren?/eax: boolean <- open-paren-token? curr-token
 122   check open-paren?, "F - test-tokenize-quote: open paren"
 123   read-from-stream stream, curr-token  # skip a
 124   read-from-stream stream, curr-token
 125   var close-paren?/eax: boolean <- close-paren-token? curr-token
 126   check close-paren?, "F - test-tokenize-quote: close paren"
 127 }
 128 
 129 fn test-tokenize-backquote {
 130   var in-storage: gap-buffer
 131   var in/esi: (addr gap-buffer) <- address in-storage
 132   initialize-gap-buffer-with in, "`(a)"
 133   #
 134   var stream-storage: (stream cell 0x10)
 135   var stream/edi: (addr stream cell) <- address stream-storage
 136   #
 137   var trace-storage: trace
 138   var trace/edx: (addr trace) <- address trace-storage
 139   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
 140   tokenize in, stream, trace
 141   #
 142   var curr-token-storage: cell
 143   var curr-token/ebx: (addr cell) <- address curr-token-storage
 144   read-from-stream stream, curr-token
 145   var backquote?/eax: boolean <- backquote-token? curr-token
 146   check backquote?, "F - test-tokenize-backquote: backquote"
 147   read-from-stream stream, curr-token
 148   var open-paren?/eax: boolean <- open-paren-token? curr-token
 149   check open-paren?, "F - test-tokenize-backquote: open paren"
 150   read-from-stream stream, curr-token  # skip a
 151   read-from-stream stream, curr-token
 152   var close-paren?/eax: boolean <- close-paren-token? curr-token
 153   check close-paren?, "F - test-tokenize-backquote: close paren"
 154 }
 155 
 156 fn test-tokenize-unquote {
 157   var in-storage: gap-buffer
 158   var in/esi: (addr gap-buffer) <- address in-storage
 159   initialize-gap-buffer-with in, ",(a)"
 160   #
 161   var stream-storage: (stream cell 0x10)
 162   var stream/edi: (addr stream cell) <- address stream-storage
 163   #
 164   var trace-storage: trace
 165   var trace/edx: (addr trace) <- address trace-storage
 166   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
 167   tokenize in, stream, trace
 168   #
 169   var curr-token-storage: cell
 170   var curr-token/ebx: (addr cell) <- address curr-token-storage
 171   read-from-stream stream, curr-token
 172   var unquote?/eax: boolean <- unquote-token? curr-token
 173   check unquote?, "F - test-tokenize-unquote: unquote"
 174   read-from-stream stream, curr-token
 175   var open-paren?/eax: boolean <- open-paren-token? curr-token
 176   check open-paren?, "F - test-tokenize-unquote: open paren"
 177   read-from-stream stream, curr-token  # skip a
 178   read-from-stream stream, curr-token
 179   var close-paren?/eax: boolean <- close-paren-token? curr-token
 180   check close-paren?, "F - test-tokenize-unquote: close paren"
 181 }
 182 
 183 fn test-tokenize-unquote-splice {
 184   var in-storage: gap-buffer
 185   var in/esi: (addr gap-buffer) <- address in-storage
 186   initialize-gap-buffer-with in, ",@a"
 187   #
 188   var stream-storage: (stream cell 0x10)
 189   var stream/edi: (addr stream cell) <- address stream-storage
 190   #
 191   var trace-storage: trace
 192   var trace/edx: (addr trace) <- address trace-storage
 193   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
 194   tokenize in, stream, trace
 195   #
 196   var curr-token-storage: cell
 197   var curr-token/ebx: (addr cell) <- address curr-token-storage
 198   read-from-stream stream, curr-token
 199   var unquote-splice?/eax: boolean <- unquote-splice-token? curr-token
 200   check unquote-splice?, "F - test-tokenize-unquote-splice: unquote-splice"
 201 }
 202 
 203 fn test-tokenize-dotted-list {
 204   var in-storage: gap-buffer
 205   var in/esi: (addr gap-buffer) <- address in-storage
 206   initialize-gap-buffer-with in, "(a . b)"
 207   #
 208   var stream-storage: (stream cell 0x10)
 209   var stream/edi: (addr stream cell) <- address stream-storage
 210   #
 211   var trace-storage: trace
 212   var trace/edx: (addr trace) <- address trace-storage
 213   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
 214   tokenize in, stream, trace
 215   #
 216   var curr-token-storage: cell
 217   var curr-token/ebx: (addr cell) <- address curr-token-storage
 218   read-from-stream stream, curr-token
 219   var open-paren?/eax: boolean <- open-paren-token? curr-token
 220   check open-paren?, "F - test-tokenize-dotted-list: open paren"
 221   read-from-stream stream, curr-token  # skip a
 222   read-from-stream stream, curr-token
 223   var dot?/eax: boolean <- dot-token? curr-token
 224   check dot?, "F - test-tokenize-dotted-list: dot"
 225   read-from-stream stream, curr-token  # skip b
 226   read-from-stream stream, curr-token
 227   var close-paren?/eax: boolean <- close-paren-token? curr-token
 228   check close-paren?, "F - test-tokenize-dotted-list: close paren"
 229 }
 230 
 231 fn test-tokenize-stream-literal {
 232   var in-storage: gap-buffer
 233   var in/esi: (addr gap-buffer) <- address in-storage
 234   initialize-gap-buffer-with in, "[abc def]"
 235   #
 236   var stream-storage: (stream cell 0x10)
 237   var stream/edi: (addr stream cell) <- address stream-storage
 238   #
 239   var trace-storage: trace
 240   var trace/edx: (addr trace) <- address trace-storage
 241   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
 242   tokenize in, stream, trace
 243   #
 244   var curr-token-storage: cell
 245   var curr-token/ebx: (addr cell) <- address curr-token-storage
 246   read-from-stream stream, curr-token
 247   var stream?/eax: boolean <- stream-token? curr-token
 248   check stream?, "F - test-tokenize-stream-literal: type"
 249   var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
 250   var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
 251   var data-equal?/eax: boolean <- stream-data-equal? curr-token-data, "abc def"
 252   check data-equal?, "F - test-tokenize-stream-literal"
 253   var empty?/eax: boolean <- stream-empty? stream
 254   check empty?, "F - test-tokenize-stream-literal: empty?"
 255 }
 256 
 257 fn test-tokenize-stream-literal-in-tree {
 258   var in-storage: gap-buffer
 259   var in/esi: (addr gap-buffer) <- address in-storage
 260   initialize-gap-buffer-with in, "([abc def])"
 261   #
 262   var stream-storage: (stream cell 0x10)
 263   var stream/edi: (addr stream cell) <- address stream-storage
 264   #
 265   var trace-storage: trace
 266   var trace/edx: (addr trace) <- address trace-storage
 267   initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
 268   tokenize in, stream, trace
 269   #
 270   var curr-token-storage: cell
 271   var curr-token/ebx: (addr cell) <- address curr-token-storage
 272   read-from-stream stream, curr-token
 273   var bracket?/eax: boolean <- bracket-token? curr-token
 274   check bracket?, "F - test-tokenize-stream-literal-in-tree: open paren"
 275   read-from-stream stream, curr-token
 276   var stream?/eax: boolean <- stream-token? curr-token
 277   check stream?, "F - test-tokenize-stream-literal-in-tree: type"
 278   var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
 279   var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
 280   var data-equal?/eax: boolean <- stream-data-equal? curr-token-data, "abc def"
 281   check data-equal?, "F - test-tokenize-stream-literal-in-tree"
 282   read-from-stream stream, curr-token
 283   var bracket?/eax: boolean <- bracket-token? curr-token
 284   check bracket?, "F - test-tokenize-stream-literal-in-tree: close paren"
 285   var empty?/eax: boolean <- stream-empty? stream
 286   check empty?, "F - test-tokenize-stream-literal-in-tree: empty?"
 287 }
 288 
 289 fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) {
 290   trace-text trace, "tokenize", "next-token"
 291   trace-lower trace
 292   var _g/eax: grapheme <- peek-from-gap-buffer in
 293   var g/ecx: grapheme <- copy _g
 294   {
 295     var should-trace?/eax: boolean <- should-trace? trace
 296     compare should-trace?, 0/false
 297     break-if-=
 298     var stream-storage: (stream byte 0x40)
 299     var stream/esi: (addr stream byte) <- address stream-storage
 300     write stream, "next: "
 301     var gval/eax: int <- copy g
 302     write-int32-hex stream, gval
 303     trace trace, "tokenize", stream
 304   }
 305   var out-cell/eax: (addr cell) <- copy _out-cell
 306   {
 307     var out-cell-type/eax: (addr int) <- get out-cell, type
 308     copy-to *out-cell-type, 0/uninitialized
 309   }
 310   var out-ah/edi: (addr handle stream byte) <- get out-cell, text-data
 311   $next-token:allocate: {
 312     # Allocate a large buffer if it's a stream.
 313     # Sometimes a whole function definition will need to fit in it.
 314     compare g, 0x5b/open-square-bracket
 315     {
 316       break-if-!=
 317       populate-stream out-ah, 0x400/max-definition-size=1KB
 318       break $next-token:allocate
 319     }
 320     populate-stream out-ah, 0x40
 321   }
 322   var _out/eax: (addr stream byte) <- lookup *out-ah
 323   var out/edi: (addr stream byte) <- copy _out
 324   clear-stream out
 325   $next-token:case: {
 326     # open square brackets begin streams
 327     {
 328       compare g, 0x5b/open-square-bracket
 329       break-if-!=
 330       var dummy/eax: grapheme <- read-from-gap-buffer in  # skip open bracket
 331       next-stream-token in, out, trace
 332       var out-cell/eax: (addr cell) <- copy _out-cell
 333       # streams set the type
 334       var out-cell-type/eax: (addr int) <- get out-cell, type
 335       copy-to *out-cell-type, 3/stream
 336       break $next-token:case
 337     }
 338     # comment
 339     {
 340       compare g, 0x23/comment
 341       break-if-!=
 342       rest-of-line in, out, trace
 343       break $next-token:case
 344     }
 345     # special-case: '-'
 346     {
 347       compare g, 0x2d/minus
 348       break-if-!=
 349       var dummy/eax: grapheme <- read-from-gap-buffer in  # skip '-'
 350       var g2/eax: grapheme <- peek-from-gap-buffer in
 351       put-back-from-gap-buffer in
 352       var digit?/eax: boolean <- decimal-digit? g2
 353       compare digit?, 0/false
 354       break-if-=
 355       next-number-token in, out, trace
 356       break $next-token:case
 357     }
 358     # digit
 359     {
 360       var digit?/eax: boolean <- decimal-digit? g
 361       compare digit?, 0/false
 362       break-if-=
 363       next-number-token in, out, trace
 364       break $next-token:case
 365     }
 366     # other symbol char
 367     {
 368       var symbol?/eax: boolean <- symbol-grapheme? g
 369       compare symbol?, 0/false
 370       break-if-=
 371       next-symbol-token in, out, trace
 372       break $next-token:case
 373     }
 374     # unbalanced close square brackets are errors
 375     {
 376       compare g, 0x5d/close-square-bracket
 377       break-if-!=
 378       error trace, "unbalanced ']'"
 379       return
 380     }
 381     # other brackets are always single-char tokens
 382     {
 383       var bracket?/eax: boolean <- bracket-grapheme? g
 384       compare bracket?, 0/false
 385       break-if-=
 386       var g/eax: grapheme <- read-from-gap-buffer in
 387       next-bracket-token g, out, trace
 388       break $next-token:case
 389     }
 390     # non-symbol operators
 391     {
 392       var operator?/eax: boolean <- operator-grapheme? g
 393       compare operator?, 0/false
 394       break-if-=
 395       next-operator-token in, out, trace
 396       break $next-token:case
 397     }
 398     # quote
 399     {
 400       compare g, 0x27/single-quote
 401       break-if-!=
 402       var g/eax: grapheme <- read-from-gap-buffer in  # consume
 403       write-grapheme out, g
 404       break $next-token:case
 405     }
 406     # backquote
 407     {
 408       compare g, 0x60/backquote
 409       break-if-!=
 410       var g/eax: grapheme <- read-from-gap-buffer in  # consume
 411       write-grapheme out, g
 412       break $next-token:case
 413     }
 414     # unquote
 415     {
 416       compare g, 0x2c/comma
 417       break-if-!=
 418       var g/eax: grapheme <- read-from-gap-buffer in  # consume
 419       write-grapheme out, g
 420       # check for unquote-splice
 421       {
 422         var g2/eax: grapheme <- peek-from-gap-buffer in
 423         compare g2, 0x40/at-sign
 424         break-if-!=
 425         g2 <- read-from-gap-buffer in
 426         write-grapheme out, g2
 427       }
 428       break $next-token:case
 429     }
 430     abort "unknown token type"
 431   }
 432   trace-higher trace
 433   {
 434     var should-trace?/eax: boolean <- should-trace? trace
 435     compare should-trace?, 0/false
 436     break-if-=
 437     var stream-storage: (stream byte 0x400)  # maximum possible token size (next-stream-token)
 438     var stream/eax: (addr stream byte) <- address stream-storage
 439     write stream, "=> "
 440     rewind-stream out
 441     write-stream stream, out
 442     trace trace, "tokenize", stream
 443   }
 444 }
 445 
 446 fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
 447   trace-text trace, "tokenize", "looking for a symbol"
 448   trace-lower trace
 449   $next-symbol-token:loop: {
 450     var done?/eax: boolean <- gap-buffer-scan-done? in
 451     compare done?, 0/false
 452     break-if-!=
 453     var g/eax: grapheme <- peek-from-gap-buffer in
 454     {
 455       {
 456         var should-trace?/eax: boolean <- should-trace? trace
 457         compare should-trace?, 0/false
 458       }
 459       break-if-=
 460       var stream-storage: (stream byte 0x40)
 461       var stream/esi: (addr stream byte) <- address stream-storage
 462       write stream, "next: "
 463       var gval/eax: int <- copy g
 464       write-int32-hex stream, gval
 465       trace trace, "tokenize", stream
 466     }
 467     # if non-symbol, return
 468     {
 469       var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
 470       compare symbol-grapheme?, 0/false
 471       break-if-!=
 472       trace-text trace, "tokenize", "stop"
 473       break $next-symbol-token:loop
 474     }
 475     var g/eax: grapheme <- read-from-gap-buffer in
 476     write-grapheme out, g
 477     loop
 478   }
 479   trace-higher trace
 480   {
 481     var should-trace?/eax: boolean <- should-trace? trace
 482     compare should-trace?, 0/false
 483     break-if-=
 484     var stream-storage: (stream byte 0x40)
 485     var stream/esi: (addr stream byte) <- address stream-storage
 486     write stream, "=> "
 487     rewind-stream out
 488     write-stream stream, out
 489     trace trace, "tokenize", stream
 490   }
 491 }
 492 
 493 fn next-operator-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
 494   trace-text trace, "tokenize", "looking for a operator"
 495   trace-lower trace
 496   $next-operator-token:loop: {
 497     var done?/eax: boolean <- gap-buffer-scan-done? in
 498     compare done?, 0/false
 499     break-if-!=
 500     var g/eax: grapheme <- peek-from-gap-buffer in
 501     {
 502       {
 503         var should-trace?/eax: boolean <- should-trace? trace
 504         compare should-trace?, 0/false
 505       }
 506       break-if-=
 507       var stream-storage: (stream byte 0x40)
 508       var stream/esi: (addr stream byte) <- address stream-storage
 509       write stream, "next: "
 510       var gval/eax: int <- copy g
 511       write-int32-hex stream, gval
 512       trace trace, "tokenize", stream
 513     }
 514     # if non-operator, return
 515     {
 516       var operator-grapheme?/eax: boolean <- operator-grapheme? g
 517       compare operator-grapheme?, 0/false
 518       break-if-!=
 519       trace-text trace, "tokenize", "stop"
 520       break $next-operator-token:loop
 521     }
 522     var g/eax: grapheme <- read-from-gap-buffer in
 523     write-grapheme out, g
 524     loop
 525   }
 526   trace-higher trace
 527   {
 528     var should-trace?/eax: boolean <- should-trace? trace
 529     compare should-trace?, 0/false
 530     break-if-=
 531     var stream-storage: (stream byte 0x40)
 532     var stream/esi: (addr stream byte) <- address stream-storage
 533     write stream, "=> "
 534     rewind-stream out
 535     write-stream stream, out
 536     trace trace, "tokenize", stream
 537   }
 538 }
 539 
 540 fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
 541   trace-text trace, "tokenize", "looking for a number"
 542   trace-lower trace
 543   $next-number-token:check-minus: {
 544     var g/eax: grapheme <- peek-from-gap-buffer in
 545     compare g, 0x2d/minus
 546     g <- read-from-gap-buffer in  # consume
 547     write-grapheme out, g
 548   }
 549   $next-number-token:loop: {
 550     var done?/eax: boolean <- gap-buffer-scan-done? in
 551     compare done?, 0/false
 552     break-if-!=
 553     var g/eax: grapheme <- peek-from-gap-buffer in
 554     {
 555       {
 556         var should-trace?/eax: boolean <- should-trace? trace
 557         compare should-trace?, 0/false
 558       }
 559       break-if-=
 560       var stream-storage: (stream byte 0x40)
 561       var stream/esi: (addr stream byte) <- address stream-storage
 562       write stream, "next: "
 563       var gval/eax: int <- copy g
 564       write-int32-hex stream, gval
 565       trace trace, "tokenize", stream
 566     }
 567     # if not symbol grapheme, return
 568     {
 569       var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
 570       compare symbol-grapheme?, 0/false
 571       break-if-!=
 572       trace-text trace, "tokenize", "stop"
 573       break $next-number-token:loop
 574     }
 575     # if not digit grapheme, abort
 576     {
 577       var digit?/eax: boolean <- decimal-digit? g
 578       compare digit?, 0/false
 579       break-if-!=
 580       error trace, "invalid number"
 581       return
 582     }
 583     trace-text trace, "tokenize", "append"
 584     var g/eax: grapheme <- read-from-gap-buffer in
 585     write-grapheme out, g
 586     loop
 587   }
 588   trace-higher trace
 589 }
 590 
 591 fn next-stream-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
 592   trace-text trace, "tokenize", "stream"
 593   {
 594     var empty?/eax: boolean <- gap-buffer-scan-done? in
 595     compare empty?, 0/false
 596     {
 597       break-if-=
 598       error trace, "unbalanced '['"
 599       return
 600     }
 601     var g/eax: grapheme <- read-from-gap-buffer in
 602     compare g, 0x5d/close-square-bracket
 603     break-if-=
 604     write-grapheme out, g
 605     loop
 606   }
 607   {
 608     var should-trace?/eax: boolean <- should-trace? trace
 609     compare should-trace?, 0/false
 610     break-if-=
 611     var stream-storage: (stream byte 0x400)  # max-definition-size
 612     var stream/esi: (addr stream byte) <- address stream-storage
 613     write stream, "=> "
 614     rewind-stream out
 615     write-stream stream, out
 616     trace trace, "tokenize", stream
 617   }
 618 }
 619 
 620 fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) {
 621   trace-text trace, "tokenize", "bracket"
 622   write-grapheme out, g
 623   {
 624     var should-trace?/eax: boolean <- should-trace? trace
 625     compare should-trace?, 0/false
 626     break-if-=
 627     var stream-storage: (stream byte 0x40)
 628     var stream/esi: (addr stream byte) <- address stream-storage
 629     write stream, "=> "
 630     rewind-stream out
 631     write-stream stream, out
 632     trace trace, "tokenize", stream
 633   }
 634 }
 635 
 636 fn rest-of-line in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
 637   trace-text trace, "tokenize", "comment"
 638   {
 639     var empty?/eax: boolean <- gap-buffer-scan-done? in
 640     compare empty?, 0/false
 641     {
 642       break-if-=
 643       return
 644     }
 645     var g/eax: grapheme <- read-from-gap-buffer in
 646     compare g, 0xa/newline
 647     break-if-=
 648     write-grapheme out, g
 649     loop
 650   }
 651   {
 652     var should-trace?/eax: boolean <- should-trace? trace
 653     compare should-trace?, 0/false
 654     break-if-=
 655     var stream-storage: (stream byte 0x80)
 656     var stream/esi: (addr stream byte) <- address stream-storage
 657     write stream, "=> "
 658     rewind-stream out
 659     write-stream stream, out
 660     trace trace, "tokenize", stream
 661   }
 662 }
 663 
 664 fn symbol-grapheme? g: grapheme -> _/eax: boolean {
 665   ## whitespace
 666   compare g, 9/tab
 667   {
 668     break-if-!=
 669     return 0/false
 670   }
 671   compare g, 0xa/newline
 672   {
 673     break-if-!=
 674     return 0/false
 675   }
 676   compare g, 0x20/space
 677   {
 678     break-if-!=
 679     return 0/false
 680   }
 681   ## quotes
 682   compare g, 0x22/double-quote
 683   {
 684     break-if-!=
 685     return 0/false
 686   }
 687   compare g, 0x60/backquote
 688   {
 689     break-if-!=
 690     return 0/false
 691   }
 692   ## brackets
 693   compare g, 0x28/open-paren
 694   {
 695     break-if-!=
 696     return 0/false
 697   }
 698   compare g, 0x29/close-paren
 699   {
 700     break-if-!=
 701     return 0/false
 702   }
 703   compare g, 0x5b/open-square-bracket
 704   {
 705     break-if-!=
 706     return 0/false
 707   }
 708   compare g, 0x5d/close-square-bracket
 709   {
 710     break-if-!=
 711     return 0/false
 712   }
 713   compare g, 0x7b/open-curly-bracket
 714   {
 715     break-if-!=
 716     return 0/false
 717   }
 718   compare g, 0x7d/close-curly-bracket
 719   {
 720     break-if-!=
 721     return 0/false
 722   }
 723   # - other punctuation
 724   # '!' is a symbol char
 725   compare g, 0x23/hash
 726   {
 727     break-if-!=
 728     return 0/false
 729   }
 730   # '$' is a symbol char
 731   compare g, 0x25/percent
 732   {
 733     break-if-!=
 734     return 0/false
 735   }
 736   compare g, 0x26/ampersand
 737   {
 738     break-if-!=
 739     return 0/false
 740   }
 741   compare g, 0x27/single-quote
 742   {
 743     break-if-!=
 744     return 0/false
 745   }
 746   compare g, 0x60/backquote
 747   {
 748     break-if-!=
 749     return 0/false
 750   }
 751   compare g, 0x2c/comma
 752   {
 753     break-if-!=
 754     return 0/false
 755   }
 756   compare g, 0x40/at-sign
 757   {
 758     break-if-!=
 759     return 0/false
 760   }
 761   compare g, 0x2a/asterisk
 762   {
 763     break-if-!=
 764     return 0/false
 765   }
 766   compare g, 0x2b/plus
 767   {
 768     break-if-!=
 769     return 0/false
 770   }
 771   compare g, 0x2d/dash  # '-' not allowed in symbols
 772   {
 773     break-if-!=
 774     return 0/false
 775   }
 776   compare g, 0x2e/period
 777   {
 778     break-if-!=
 779     return 0/false
 780   }
 781   compare g, 0x2f/slash
 782   {
 783     break-if-!=
 784     return 0/false
 785   }
 786   compare g, 0x3a/colon
 787   {
 788     break-if-!=
 789     return 0/false
 790   }
 791   compare g, 0x3b/semi-colon
 792   {
 793     break-if-!=
 794     return 0/false
 795   }
 796   compare g, 0x3c/less-than
 797   {
 798     break-if-!=
 799     return 0/false
 800   }
 801   compare g, 0x3d/equal
 802   {
 803     break-if-!=
 804     return 0/false
 805   }
 806   compare g, 0x3e/greater-than
 807   {
 808     break-if-!=
 809     return 0/false
 810   }
 811   # '?' is a symbol char
 812   compare g, 0x5c/backslash
 813   {
 814     break-if-!=
 815     return 0/false
 816   }
 817   compare g, 0x5e/caret
 818   {
 819     break-if-!=
 820     return 0/false
 821   }
 822   # '_' is a symbol char
 823   compare g, 0x7c/vertical-line
 824   {
 825     break-if-!=
 826     return 0/false
 827   }
 828   compare g, 0x7e/tilde
 829   {
 830     break-if-!=
 831     return 0/false
 832   }
 833   return 1/true
 834 }
 835 
 836 fn bracket-grapheme? g: grapheme -> _/eax: boolean {
 837   compare g, 0x28/open-paren
 838   {
 839     break-if-!=
 840     return 1/true
 841   }
 842   compare g, 0x29/close-paren
 843   {
 844     break-if-!=
 845     return 1/true
 846   }
 847   compare g, 0x5b/open-square-bracket
 848   {
 849     break-if-!=
 850     return 1/true
 851   }
 852   compare g, 0x5d/close-square-bracket
 853   {
 854     break-if-!=
 855     return 1/true
 856   }
 857   compare g, 0x7b/open-curly-bracket
 858   {
 859     break-if-!=
 860     return 1/true
 861   }
 862   compare g, 0x7d/close-curly-bracket
 863   {
 864     break-if-!=
 865     return 1/true
 866   }
 867   return 0/false
 868 }
 869 
 870 fn operator-grapheme? g: grapheme -> _/eax: boolean {
 871   # '$' is a symbol char
 872   compare g, 0x25/percent
 873   {
 874     break-if-!=
 875     return 1/false
 876   }
 877   compare g, 0x26/ampersand
 878   {
 879     break-if-!=
 880     return 1/true
 881   }
 882   compare g, 0x27/single-quote
 883   {
 884     break-if-!=
 885     return 0/true
 886   }
 887   compare g, 0x60/backquote
 888   {
 889     break-if-!=
 890     return 0/false
 891   }
 892   compare g, 0x2c/comma
 893   {
 894     break-if-!=
 895     return 0/false
 896   }
 897   compare g, 0x40/at-sign
 898   {
 899     break-if-!=
 900     return 0/false
 901   }
 902   compare g, 0x2a/asterisk
 903   {
 904     break-if-!=
 905     return 1/true
 906   }
 907   compare g, 0x2b/plus
 908   {
 909     break-if-!=
 910     return 1/true
 911   }
 912   compare g, 0x2d/dash  # '-' not allowed in symbols
 913   {
 914     break-if-!=
 915     return 1/true
 916   }
 917   compare g, 0x2e/period
 918   {
 919     break-if-!=
 920     return 1/true
 921   }
 922   compare g, 0x2f/slash
 923   {
 924     break-if-!=
 925     return 1/true
 926   }
 927   compare g, 0x3a/colon
 928   {
 929     break-if-!=
 930     return 1/true
 931   }
 932   compare g, 0x3b/semi-colon
 933   {
 934     break-if-!=
 935     return 1/true
 936   }
 937   compare g, 0x3c/less-than
 938   {
 939     break-if-!=
 940     return 1/true
 941   }
 942   compare g, 0x3d/equal
 943   {
 944     break-if-!=
 945     return 1/true
 946   }
 947   compare g, 0x3e/greater-than
 948   {
 949     break-if-!=
 950     return 1/true
 951   }
 952   # '?' is a symbol char
 953   compare g, 0x5c/backslash
 954   {
 955     break-if-!=
 956     return 1/true
 957   }
 958   compare g, 0x5e/caret
 959   {
 960     break-if-!=
 961     return 1/true
 962   }
 963   # '_' is a symbol char
 964   compare g, 0x7c/vertical-line
 965   {
 966     break-if-!=
 967     return 1/true
 968   }
 969   compare g, 0x7e/tilde
 970   {
 971     break-if-!=
 972     return 1/true
 973   }
 974   return 0/false
 975 }
 976 
 977 fn number-token? _in: (addr cell) -> _/eax: boolean {
       # Does this token look like a number? Only the start of its text is examined:
       # the first grapheme, after skipping one optional leading '-', must be a
       # decimal digit. Anything after that is not checked here.
 978   var token/eax: (addr cell) <- copy _in
 979   var text-ah/eax: (addr handle stream byte) <- get token, text-data
 980   var _text/eax: (addr stream byte) <- lookup *text-ah
 981   var text/ecx: (addr stream byte) <- copy _text  # keep the stream in ecx; eax is reused for graphemes below
 982   rewind-stream text
 983   var first/eax: grapheme <- read-grapheme text
 984   # a leading minus sign is skipped; judge the grapheme after it instead
 985   compare first, 0x2d/minus
 986   {
 987     break-if-!=
 988     first <- read-grapheme text
 989   }
 990   var digit?/eax: boolean <- decimal-digit? first
 991   return digit?
 992 }
 993 
 994 fn bracket-token? _in: (addr cell) -> _/eax: boolean {
       # Does this token start with a bracket grapheme (as judged by bracket-grapheme?)?
       # Tokens whose type is 3/stream are rejected before their text is looked at.
 995   var token/eax: (addr cell) <- copy _in
 996   # bail out early on streams: they are never paren tokens
 997   {
 998     var type-a/eax: (addr int) <- get token, type
 999     compare *type-a, 3/stream
1000     break-if-!=
1001     return 0/false
1002   }
1003   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1004   var text/eax: (addr stream byte) <- lookup *text-ah
1005   rewind-stream text
1006   var first/eax: grapheme <- read-grapheme text
1007   var bracket?/eax: boolean <- bracket-grapheme? first
1008   return bracket?
1009 }
1010 
1011 fn quote-token? _in: (addr cell) -> _/eax: boolean {
       # Is this the quote token? Compares the token's text against "'" using
       # stream-data-equal? and returns that verdict unchanged.
1012   var token/eax: (addr cell) <- copy _in
1013   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1014   var text/eax: (addr stream byte) <- lookup *text-ah
1015   rewind-stream text
1016   var match?/eax: boolean <- stream-data-equal? text, "'"
1017   return match?
1018 }
1019 
1020 fn backquote-token? _in: (addr cell) -> _/eax: boolean {
       # Is this the backquote token? Compares the token's text against "`" using
       # stream-data-equal? and returns that verdict unchanged.
1021   var token/eax: (addr cell) <- copy _in
1022   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1023   var text/eax: (addr stream byte) <- lookup *text-ah
1024   rewind-stream text
1025   var match?/eax: boolean <- stream-data-equal? text, "`"
1026   return match?
1027 }
1028 
1029 fn unquote-token? _in: (addr cell) -> _/eax: boolean {
       # Is this the unquote token? Compares the token's text against "," using
       # stream-data-equal? and returns that verdict unchanged.
1030   var token/eax: (addr cell) <- copy _in
1031   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1032   var text/eax: (addr stream byte) <- lookup *text-ah
1033   rewind-stream text
1034   var match?/eax: boolean <- stream-data-equal? text, ","
1035   return match?
1036 }
1037 
1038 fn unquote-splice-token? _in: (addr cell) -> _/eax: boolean {
       # Is this the unquote-splice token? Compares the token's text against ",@"
       # using stream-data-equal? and returns that verdict unchanged.
1039   var token/eax: (addr cell) <- copy _in
1040   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1041   var text/eax: (addr stream byte) <- lookup *text-ah
1042   rewind-stream text
1043   var match?/eax: boolean <- stream-data-equal? text, ",@"
1044   return match?
1045 }
1046 
1047 fn open-paren-token? _in: (addr cell) -> _/eax: boolean {
       # Is this token a lone '('? The first grapheme must be an open paren, and
       # stream-empty? must then report that nothing is left to read after it.
1048   var token/eax: (addr cell) <- copy _in
1049   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1050   var _text/eax: (addr stream byte) <- lookup *text-ah
1051   var text/ecx: (addr stream byte) <- copy _text  # keep the stream in ecx; eax is reused below
1052   rewind-stream text
1053   var first/eax: grapheme <- read-grapheme text
1054   compare first, 0x28/open-paren
1055   {
1056     break-if-=
1057     return 0/false
1058   }
1059   var rest-empty?/eax: boolean <- stream-empty? text
1060   return rest-empty?
1061 }
1062 
1063 fn close-paren-token? _in: (addr cell) -> _/eax: boolean {
       # Is this token a lone ')'? The first grapheme must be a close paren, and
       # stream-empty? must then report that nothing is left to read after it.
1064   var token/eax: (addr cell) <- copy _in
1065   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1066   var _text/eax: (addr stream byte) <- lookup *text-ah
1067   var text/ecx: (addr stream byte) <- copy _text  # keep the stream in ecx; eax is reused below
1068   rewind-stream text
1069   var first/eax: grapheme <- read-grapheme text
1070   compare first, 0x29/close-paren
1071   {
1072     break-if-=
1073     return 0/false
1074   }
1075   var rest-empty?/eax: boolean <- stream-empty? text
1076   return rest-empty?
1077 }
1078 
1079 fn dot-token? _in: (addr cell) -> _/eax: boolean {
       # Is this token a lone '.'? The first grapheme must be a dot, and
       # stream-empty? must then report that nothing is left to read after it.
1080   var token/eax: (addr cell) <- copy _in
1081   var text-ah/eax: (addr handle stream byte) <- get token, text-data
1082   var _text/eax: (addr stream byte) <- lookup *text-ah
1083   var text/ecx: (addr stream byte) <- copy _text  # keep the stream in ecx; eax is reused below
1084   rewind-stream text
1085   var first/eax: grapheme <- read-grapheme text
1086   compare first, 0x2e/dot
1087   {
1088     break-if-=
1089     return 0/false
1090   }
1091   var rest-empty?/eax: boolean <- stream-empty? text
1092   return rest-empty?
1093 }
1094 
1095 fn test-dot-token {
       # Build a symbol cell whose text is "." and check that dot-token? accepts it.
1096   var sym-storage: (handle cell)
1097   var sym-ah/eax: (addr handle cell) <- address sym-storage
1098   new-symbol sym-ah, "."
1099   var sym/eax: (addr cell) <- lookup *sym-ah
1100   var dot?/eax: boolean <- dot-token? sym
1101   check dot?, "F - test-dot-token"
1102 }
1103 
1104 fn stream-token? _in: (addr cell) -> _/eax: boolean {
       # Is this a stream token? Decided purely by the cell's type field being 3/stream;
       # the token's text is never consulted.
1105   var token/eax: (addr cell) <- copy _in
1106   var type-a/eax: (addr int) <- get token, type
1107   compare *type-a, 3/stream
1108   {
1109     break-if-!=
1110     return 1/true
1111   }
1112   return 0/false
1113 }
1114 
1115 fn comment-token? _in: (addr cell) -> _/eax: boolean {
       # Is this token a comment, i.e. a non-stream token whose first grapheme is '#'?
       # tokenize discards every token this function accepts, so a false positive
       # silently loses input.
1116   var in/eax: (addr cell) <- copy _in
       # Streams are never comments (same guard as bracket-token?).
       # NOTE(review): assumes a stream token's text-data holds the stream's raw
       # contents, which may begin with '#'; confirm against next-token.
       {
         var in-type/eax: (addr int) <- get in, type
         compare *in-type, 3/stream
         break-if-!=
         return 0/false
       }
1117   var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
1118   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
1119   rewind-stream in-data
1120   var g/eax: grapheme <- read-grapheme in-data
1121   compare g, 0x23/hash
1122   {
1123     break-if-=
1124     return 0/false
1125   }
1126   return 1/true
1127 }
1127 }