https://github.com/akkartik/mu/blob/main/shell/tokenize.mu
   1 # The language is indent-sensitive.
   2 # Each line consists of an initial indent token followed by other tokens.
   3 type token {
       # One unit of tokenizer output. 'type' says which kind of token this is;
       # the kinds are listed in the comments below.
   4   type: int
   5   # type 0: default
   6   # type 1: stream
       # text of the token; in this file it is filled in by next-symbol-token,
       # next-bracket-token and next-stream-token
   7   text-data: (handle stream byte)
   8   # type 2: skip (end of line or end of file)
   9   # type 3: indent
       # for indent tokens: number of leading spaces (see next-indent-token)
  10   number-data: int
  11 }
  12 
  13 fn tokenize in: (addr gap-buffer), out: (addr stream token), trace: (addr trace) {
       # Rewind 'in' and append its tokens to 'out' until the scan is done.
       # Skip tokens are not written to 'out'.
       # Stops early, leaving whatever was already written in 'out', as soon as
       # 'trace' reports an error.
  14   trace-text trace, "tokenize", "tokenize"
  15   trace-lower trace
  16   rewind-gap-buffer in
       # threaded through successive calls to next-token
  17   var at-start-of-line?/edi: boolean <- copy 1/true
  18   {
  19     var done?/eax: boolean <- gap-buffer-scan-done? in
  20     compare done?, 0/false
  21     break-if-!=
  22     # scratch space for a single token
  23     var token-storage: token
  24     var token/edx: (addr token) <- address token-storage
  25     at-start-of-line? <- next-token in, token, at-start-of-line?, trace
  26     var error?/eax: boolean <- has-errors? trace
  27     compare error?, 0/false
  28     {
  29       break-if-=
           # NOTE(review): this exit skips the trace-higher at the bottom of the
           # function -- confirm that's intended once the trace has an error
  30       return
  31     }
  32     var skip?/eax: boolean <- skip-token? token
  33     compare skip?, 0/false
         # skip tokens: go around again without emitting anything
  34     loop-if-!=
  35     write-to-stream out, token  # shallow-copy text-data
  36     loop
  37   }
  38   trace-higher trace
  39 }
  40 
  41 fn test-tokenize-number {
       # "123 a" => indent token of 0 spaces, then a number token with text "123"
  42   var buf-storage: gap-buffer
  43   var buf/esi: (addr gap-buffer) <- address buf-storage
  44   initialize-gap-buffer-with buf, "123 a"
  45   # where the tokens go
  46   var tokens-storage: (stream token 0x10)
  47   var tokens/edi: (addr stream token) <- address tokens-storage
  48   # trace that only records errors
  49   var t-storage: trace
  50   var t/edx: (addr trace) <- address t-storage
  51   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
  52   tokenize buf, tokens, t
  53   # the line starts with an indent token
  54   var tok-storage: token
  55   var tok/ebx: (addr token) <- address tok-storage
  56   read-from-stream tokens, tok
  57   var tok-type/eax: (addr int) <- get tok, type
  58   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-number/before-indent-type"
  59   var tok-indent/eax: (addr int) <- get tok, number-data
  60   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-number/before-indent"
  61   read-from-stream tokens, tok
  62   var is-number?/eax: boolean <- number-token? tok
  63   check is-number?, "F - test-tokenize-number"
  64   var text-ah/eax: (addr handle stream byte) <- get tok, text-data
  65   var text/eax: (addr stream byte) <- lookup *text-ah
  66   check-stream-equal text, "123", "F - test-tokenize-number: value"
  67 }
  68 
  69 fn test-tokenize-negative-number {
       # "-123 a" => indent token of 0 spaces, then a number token with text "-123"
  70   var buf-storage: gap-buffer
  71   var buf/esi: (addr gap-buffer) <- address buf-storage
  72   initialize-gap-buffer-with buf, "-123 a"
  73   # where the tokens go
  74   var tokens-storage: (stream token 0x10)
  75   var tokens/edi: (addr stream token) <- address tokens-storage
  76   # trace that only records errors
  77   var t-storage: trace
  78   var t/edx: (addr trace) <- address t-storage
  79   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
  80   tokenize buf, tokens, t
  81   # the line starts with an indent token
  82   var tok-storage: token
  83   var tok/ebx: (addr token) <- address tok-storage
  84   read-from-stream tokens, tok
  85   var tok-type/eax: (addr int) <- get tok, type
  86   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-negative-number/before-indent-type"
  87   var tok-indent/eax: (addr int) <- get tok, number-data
  88   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-negative-number/before-indent"
  89   read-from-stream tokens, tok
  90   var is-number?/eax: boolean <- number-token? tok
  91   check is-number?, "F - test-tokenize-negative-number"
  92   var text-ah/eax: (addr handle stream byte) <- get tok, text-data
  93   var text/eax: (addr stream byte) <- lookup *text-ah
  94   check-stream-equal text, "-123", "F - test-tokenize-negative-number: value"
  95 }
  96 
  97 fn test-tokenize-quote {
       # "'(a)" => indent, quote, open paren, a, close paren
  98   var buf-storage: gap-buffer
  99   var buf/esi: (addr gap-buffer) <- address buf-storage
 100   initialize-gap-buffer-with buf, "'(a)"
 101   # where the tokens go
 102   var tokens-storage: (stream token 0x10)
 103   var tokens/edi: (addr stream token) <- address tokens-storage
 104   # trace that only records errors
 105   var t-storage: trace
 106   var t/edx: (addr trace) <- address t-storage
 107   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 108   tokenize buf, tokens, t
 109   # the line starts with an indent token
 110   var tok-storage: token
 111   var tok/ebx: (addr token) <- address tok-storage
 112   read-from-stream tokens, tok
 113   var tok-type/eax: (addr int) <- get tok, type
 114   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-quote/before-indent-type"
 115   var tok-indent/eax: (addr int) <- get tok, number-data
 116   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-quote/before-indent"
 117   read-from-stream tokens, tok
 118   var is-quote?/eax: boolean <- quote-token? tok
 119   check is-quote?, "F - test-tokenize-quote: quote"
 120   read-from-stream tokens, tok
 121   var is-open?/eax: boolean <- open-paren-token? tok
 122   check is-open?, "F - test-tokenize-quote: open paren"
 123   read-from-stream tokens, tok  # the symbol a; not checked
 124   read-from-stream tokens, tok
 125   var is-close?/eax: boolean <- close-paren-token? tok
 126   check is-close?, "F - test-tokenize-quote: close paren"
 127 }
 128 
 129 fn test-tokenize-backquote {
       # "`(a)" => indent, backquote, open paren, a, close paren
 130   var buf-storage: gap-buffer
 131   var buf/esi: (addr gap-buffer) <- address buf-storage
 132   initialize-gap-buffer-with buf, "`(a)"
 133   # where the tokens go
 134   var tokens-storage: (stream token 0x10)
 135   var tokens/edi: (addr stream token) <- address tokens-storage
 136   # trace that only records errors
 137   var t-storage: trace
 138   var t/edx: (addr trace) <- address t-storage
 139   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 140   tokenize buf, tokens, t
 141   # the line starts with an indent token
 142   var tok-storage: token
 143   var tok/ebx: (addr token) <- address tok-storage
 144   read-from-stream tokens, tok
 145   var tok-type/eax: (addr int) <- get tok, type
 146   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-backquote/before-indent-type"
 147   var tok-indent/eax: (addr int) <- get tok, number-data
 148   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-backquote/before-indent"
 149   read-from-stream tokens, tok
 150   var is-backquote?/eax: boolean <- backquote-token? tok
 151   check is-backquote?, "F - test-tokenize-backquote: backquote"
 152   read-from-stream tokens, tok
 153   var is-open?/eax: boolean <- open-paren-token? tok
 154   check is-open?, "F - test-tokenize-backquote: open paren"
 155   read-from-stream tokens, tok  # the symbol a; not checked
 156   read-from-stream tokens, tok
 157   var is-close?/eax: boolean <- close-paren-token? tok
 158   check is-close?, "F - test-tokenize-backquote: close paren"
 159 }
 160 
 161 fn test-tokenize-unquote {
       # ",(a)" => indent, unquote, open paren, a, close paren
 162   var buf-storage: gap-buffer
 163   var buf/esi: (addr gap-buffer) <- address buf-storage
 164   initialize-gap-buffer-with buf, ",(a)"
 165   # where the tokens go
 166   var tokens-storage: (stream token 0x10)
 167   var tokens/edi: (addr stream token) <- address tokens-storage
 168   # trace that only records errors
 169   var t-storage: trace
 170   var t/edx: (addr trace) <- address t-storage
 171   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 172   tokenize buf, tokens, t
 173   # the line starts with an indent token
 174   var tok-storage: token
 175   var tok/ebx: (addr token) <- address tok-storage
 176   read-from-stream tokens, tok
 177   var tok-type/eax: (addr int) <- get tok, type
 178   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-unquote/before-indent-type"
 179   var tok-indent/eax: (addr int) <- get tok, number-data
 180   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-unquote/before-indent"
 181   read-from-stream tokens, tok
 182   var is-unquote?/eax: boolean <- unquote-token? tok
 183   check is-unquote?, "F - test-tokenize-unquote: unquote"
 184   read-from-stream tokens, tok
 185   var is-open?/eax: boolean <- open-paren-token? tok
 186   check is-open?, "F - test-tokenize-unquote: open paren"
 187   read-from-stream tokens, tok  # the symbol a; not checked
 188   read-from-stream tokens, tok
 189   var is-close?/eax: boolean <- close-paren-token? tok
 190   check is-close?, "F - test-tokenize-unquote: close paren"
 191 }
 192 
 193 fn test-tokenize-unquote-splice {
       # ",@a" => indent, then a single unquote-splice token
 194   var buf-storage: gap-buffer
 195   var buf/esi: (addr gap-buffer) <- address buf-storage
 196   initialize-gap-buffer-with buf, ",@a"
 197   # where the tokens go
 198   var tokens-storage: (stream token 0x10)
 199   var tokens/edi: (addr stream token) <- address tokens-storage
 200   # trace that only records errors
 201   var t-storage: trace
 202   var t/edx: (addr trace) <- address t-storage
 203   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 204   tokenize buf, tokens, t
 205   # the line starts with an indent token
 206   var tok-storage: token
 207   var tok/ebx: (addr token) <- address tok-storage
 208   read-from-stream tokens, tok
 209   var tok-type/eax: (addr int) <- get tok, type
 210   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-unquote-splice/before-indent-type"
 211   var tok-indent/eax: (addr int) <- get tok, number-data
 212   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-unquote-splice/before-indent"
 213   read-from-stream tokens, tok
 214   var is-splice?/eax: boolean <- unquote-splice-token? tok
 215   check is-splice?, "F - test-tokenize-unquote-splice: unquote-splice"
 216 }
 217 
 218 fn test-tokenize-dotted-list {
       # "(a . b)" => indent, open paren, a, dot, b, close paren
 219   var buf-storage: gap-buffer
 220   var buf/esi: (addr gap-buffer) <- address buf-storage
 221   initialize-gap-buffer-with buf, "(a . b)"
 222   # where the tokens go
 223   var tokens-storage: (stream token 0x10)
 224   var tokens/edi: (addr stream token) <- address tokens-storage
 225   # trace that only records errors
 226   var t-storage: trace
 227   var t/edx: (addr trace) <- address t-storage
 228   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 229   tokenize buf, tokens, t
 230   # the line starts with an indent token
 231   var tok-storage: token
 232   var tok/ebx: (addr token) <- address tok-storage
 233   read-from-stream tokens, tok
 234   var tok-type/eax: (addr int) <- get tok, type
 235   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-dotted-list/before-indent-type"
 236   var tok-indent/eax: (addr int) <- get tok, number-data
 237   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-dotted-list/before-indent"
 238   read-from-stream tokens, tok
 239   var is-open?/eax: boolean <- open-paren-token? tok
 240   check is-open?, "F - test-tokenize-dotted-list: open paren"
 241   read-from-stream tokens, tok  # the symbol a; not checked
 242   read-from-stream tokens, tok
 243   var is-dot?/eax: boolean <- dot-token? tok
 244   check is-dot?, "F - test-tokenize-dotted-list: dot"
 245   read-from-stream tokens, tok  # the symbol b; not checked
 246   read-from-stream tokens, tok
 247   var is-close?/eax: boolean <- close-paren-token? tok
 248   check is-close?, "F - test-tokenize-dotted-list: close paren"
 249 }
 250 
 251 fn test-tokenize-stream-literal {
       # "[abc def]" => indent, then one stream token holding "abc def", then nothing
 252   var buf-storage: gap-buffer
 253   var buf/esi: (addr gap-buffer) <- address buf-storage
 254   initialize-gap-buffer-with buf, "[abc def]"
 255   # where the tokens go
 256   var tokens-storage: (stream token 0x10)
 257   var tokens/edi: (addr stream token) <- address tokens-storage
 258   # trace that only records errors
 259   var t-storage: trace
 260   var t/edx: (addr trace) <- address t-storage
 261   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 262   tokenize buf, tokens, t
 263   # the line starts with an indent token
 264   var tok-storage: token
 265   var tok/ebx: (addr token) <- address tok-storage
 266   read-from-stream tokens, tok
 267   var tok-type/eax: (addr int) <- get tok, type
 268   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-stream-literal/before-indent-type"
 269   var tok-indent/eax: (addr int) <- get tok, number-data
 270   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-stream-literal/before-indent"
 271   read-from-stream tokens, tok
 272   var is-stream?/eax: boolean <- stream-token? tok
 273   check is-stream?, "F - test-tokenize-stream-literal: type"
 274   var text-ah/eax: (addr handle stream byte) <- get tok, text-data
 275   var text/eax: (addr stream byte) <- lookup *text-ah
 276   var same?/eax: boolean <- stream-data-equal? text, "abc def"
 277   check same?, "F - test-tokenize-stream-literal"
 278   var drained?/eax: boolean <- stream-empty? tokens
 279   check drained?, "F - test-tokenize-stream-literal: empty?"
 280 }
 281 
 282 fn test-tokenize-stream-literal-in-tree {
       # "([abc def])" => indent, bracket, stream token "abc def", bracket, then nothing
 283   var buf-storage: gap-buffer
 284   var buf/esi: (addr gap-buffer) <- address buf-storage
 285   initialize-gap-buffer-with buf, "([abc def])"
 286   # where the tokens go
 287   var tokens-storage: (stream token 0x10)
 288   var tokens/edi: (addr stream token) <- address tokens-storage
 289   # trace that only records errors
 290   var t-storage: trace
 291   var t/edx: (addr trace) <- address t-storage
 292   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 293   tokenize buf, tokens, t
 294   # the line starts with an indent token
 295   var tok-storage: token
 296   var tok/ebx: (addr token) <- address tok-storage
 297   read-from-stream tokens, tok
 298   var tok-type/eax: (addr int) <- get tok, type
 299   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-stream-literal-in-tree/before-indent-type"
 300   var tok-indent/eax: (addr int) <- get tok, number-data
 301   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-stream-literal-in-tree/before-indent"
 302   read-from-stream tokens, tok
 303   var is-bracket?/eax: boolean <- bracket-token? tok
 304   check is-bracket?, "F - test-tokenize-stream-literal-in-tree: open paren"
 305   read-from-stream tokens, tok
 306   var is-stream?/eax: boolean <- stream-token? tok
 307   check is-stream?, "F - test-tokenize-stream-literal-in-tree: type"
 308   var text-ah/eax: (addr handle stream byte) <- get tok, text-data
 309   var text/eax: (addr stream byte) <- lookup *text-ah
 310   var same?/eax: boolean <- stream-data-equal? text, "abc def"
 311   check same?, "F - test-tokenize-stream-literal-in-tree"
 312   read-from-stream tokens, tok
 313   var is-bracket?/eax: boolean <- bracket-token? tok
 314   check is-bracket?, "F - test-tokenize-stream-literal-in-tree: close paren"
 315   var drained?/eax: boolean <- stream-empty? tokens
 316   check drained?, "F - test-tokenize-stream-literal-in-tree: empty?"
 317 }
 318 
 319 fn test-tokenize-indent {
       # "abc\n  def" => indent 0, abc, indent 2, def
 320   var buf-storage: gap-buffer
 321   var buf/esi: (addr gap-buffer) <- address buf-storage
 322   initialize-gap-buffer-with buf, "abc\n  def"
 323   # where the tokens go
 324   var tokens-storage: (stream token 0x10)
 325   var tokens/edi: (addr stream token) <- address tokens-storage
 326   # trace that only records errors
 327   var t-storage: trace
 328   var t/edx: (addr trace) <- address t-storage
 329   initialize-trace t, 1/only-errors, 0x10/capacity, 0/visible
 330   tokenize buf, tokens, t
 331   # first line: indent of 0, then abc
 332   var tok-storage: token
 333   var tok/ebx: (addr token) <- address tok-storage
 334   read-from-stream tokens, tok
 335   var tok-type/eax: (addr int) <- get tok, type
 336   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-indent/before-indent-type"
 337   var tok-indent/eax: (addr int) <- get tok, number-data
 338   check-ints-equal *tok-indent, 0/spaces, "F - test-tokenize-indent/before-indent"
 339   read-from-stream tokens, tok
 340   var text-ah/eax: (addr handle stream byte) <- get tok, text-data
 341   var text/eax: (addr stream byte) <- lookup *text-ah
 342   check-stream-equal text, "abc", "F - test-tokenize-indent/before"
 343   # second line: indent of 2
 344   read-from-stream tokens, tok
 345   var tok-type/eax: (addr int) <- get tok, type
 346   check-ints-equal *tok-type, 3/indent, "F - test-tokenize-indent/type"
 347   var tok-indent/eax: (addr int) <- get tok, number-data
 348   check-ints-equal *tok-indent, 2/spaces, "F - test-tokenize-indent"
 349   # ...then def
 350   read-from-stream tokens, tok
 351   var text-ah/eax: (addr handle stream byte) <- get tok, text-data
 352   var text/eax: (addr stream byte) <- lookup *text-ah
 353   check-stream-equal text, "def", "F - test-tokenize-indent/after"
 354 }
 355 
 356 # caller is responsible for threading start-of-line? between calls to next-token
 357 # 'in' may contain whitespace if start-of-line?
     #
     # Reads at most one token from 'in' into 'out'. The result says whether the
     # following call should be made with start-of-line? set:
     #   - newline or end of input: 'out' becomes a skip token, result is 1
     #   - start-of-line? with content on the line: 'out' is the indent token and
     #     nothing else is consumed; result is 0
     #   - otherwise: 'out' is filled in by one of the cases below and the incoming
     #     start-of-line? is returned
 358 fn next-token in: (addr gap-buffer), out: (addr token), start-of-line?: boolean, trace: (addr trace) -> _/edi: boolean {
 359   trace-text trace, "tokenize", "next-token"
 360   trace-lower trace
 361   # save an indent token if necessary
 362   {
 363     compare start-of-line?, 0/false
 364     break-if-=
 365     next-indent-token in, out, trace  # might not be returned
 366   }
 367   skip-spaces-from-gap-buffer in
 368   var g/eax: grapheme <- peek-from-gap-buffer in
       # '#' starts a comment; drop everything up to the newline
 369   {
 370     compare g, 0x23/comment
 371     break-if-!=
 372     skip-rest-of-line in
 373   }
       # peek again since a comment may have been skipped
 374   var g/eax: grapheme <- peek-from-gap-buffer in
 375   {
 376     compare g, 0xa/newline
 377     break-if-!=
 378     trace-text trace, "tokenize", "newline"
 379     g <- read-from-gap-buffer in
 380     initialize-skip-token out  # might drop indent if that's all there was in this line
 381     trace-higher trace
 382     return 1/at-start-of-line
 383   }
 384   {
 385     compare start-of-line?, 0/false
 386     break-if-=
 387     # still here? no comment or newline? return saved indent
 388     trace-higher trace
 389     return 0/not-at-start-of-line
 390   }
 391   {
 392     var done?/eax: boolean <- gap-buffer-scan-done? in
 393     compare done?, 0/false
 394     break-if-=
 395     trace-text trace, "tokenize", "end"
 396     initialize-skip-token out
 397     trace-higher trace
 398     return 1/at-start-of-line
 399   }
       # keep the grapheme in ecx; eax is reused for call results below
 400   var _g/eax: grapheme <- peek-from-gap-buffer in
 401   var g/ecx: grapheme <- copy _g
 402   {
 403     var should-trace?/eax: boolean <- should-trace? trace
 404     compare should-trace?, 0/false
 405     break-if-=
 406     var stream-storage: (stream byte 0x40)
 407     var stream/esi: (addr stream byte) <- address stream-storage
 408     write stream, "next: "
 409     var gval/eax: int <- copy g
 410     write-int32-hex stream, gval
 411     trace trace, "tokenize", stream
 412   }
       # dispatch on the peeked grapheme; each case that matches fills in 'out'
       # and breaks out of this labelled block
 413   $next-token:case: {
 414     # open square brackets begin streams
 415     {
 416       compare g, 0x5b/open-square-bracket
 417       break-if-!=
 418       var dummy/eax: grapheme <- read-from-gap-buffer in  # skip open bracket
 419       next-stream-token in, out, trace
 420       break $next-token:case
 421     }
 422     # other symbol char
 423     {
 424       var symbol?/eax: boolean <- symbol-grapheme? g
 425       compare symbol?, 0/false
 426       break-if-=
 427       next-symbol-token in, out, trace
 428       break $next-token:case
 429     }
 430     # unbalanced close square brackets are errors
 431     {
 432       compare g, 0x5d/close-square-bracket
 433       break-if-!=
 434       error trace, "unbalanced ']'"
           # NOTE(review): returns without the trace-higher that the other exits
           # perform -- confirm that's fine after an error
 435       return start-of-line?
 436     }
 437     # other brackets are always single-char tokens
 438     {
 439       var bracket?/eax: boolean <- bracket-grapheme? g
 440       compare bracket?, 0/false
 441       break-if-=
 442       var g/eax: grapheme <- read-from-gap-buffer in
 443       next-bracket-token g, out, trace
 444       break $next-token:case
 445     }
 446     # quote
 447     {
 448       compare g, 0x27/single-quote
 449       break-if-!=
 450       var g/eax: grapheme <- read-from-gap-buffer in  # consume
 451       initialize-token out, "'"
 452       break $next-token:case
 453     }
 454     # backquote
 455     {
 456       compare g, 0x60/backquote
 457       break-if-!=
 458       var g/eax: grapheme <- read-from-gap-buffer in  # consume
 459       initialize-token out, "`"
 460       break $next-token:case
 461     }
 462     # unquote
 463     {
 464       compare g, 0x2c/comma
 465       break-if-!=
 466       var g/eax: grapheme <- read-from-gap-buffer in  # consume
 467       # check for unquote-splice
 468       {
 469         g <- peek-from-gap-buffer in
 470         compare g, 0x40/at-sign
 471         break-if-!=
 472         g <- read-from-gap-buffer in
 473         initialize-token out, ",@"
 474         break $next-token:case
 475       }
 476       initialize-token out, ","
 477       break $next-token:case
 478     }
         # no case matched: show the grapheme's code on screen and abort
 479     set-cursor-position 0/screen, 0x40 0x20
 480     {
 481       var foo/eax: int <- copy g
 482       draw-int32-decimal-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, foo, 7/fg 0/bg
 483     }
 484     abort "unknown token type"
 485   }
 486   trace-higher trace
       # trace the text of the token just produced
 487   {
 488     var should-trace?/eax: boolean <- should-trace? trace
 489     compare should-trace?, 0/false
 490     break-if-=
 491     var stream-storage: (stream byte 0x400)  # maximum possible token size (next-stream-token)
 492     var stream/eax: (addr stream byte) <- address stream-storage
 493     write stream, "=> "
 494     write-token-text-data stream, out
 495     trace trace, "tokenize", stream
 496   }
 497   return start-of-line?
 498 }
 499 
 500 fn next-symbol-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
       # Populate out's text-data (0x40 bytes) and copy graphemes from 'in' into it
       # until the scan is done or the next grapheme isn't a symbol grapheme.
       # The grapheme that stops the loop is only peeked, so it stays in 'in'.
 501   trace-text trace, "tokenize", "looking for a symbol"
 502   trace-lower trace
 503   var out/eax: (addr token) <- copy _out
 504   var out-data-ah/eax: (addr handle stream byte) <- get out, text-data
 505   populate-stream out-data-ah, 0x40/max-symbol-size
 506   var _out-data/eax: (addr stream byte) <- lookup *out-data-ah
       # move to edi; eax is needed for graphemes and call results
 507   var out-data/edi: (addr stream byte) <- copy _out-data
 508   $next-symbol-token:loop: {
 509     var done?/eax: boolean <- gap-buffer-scan-done? in
 510     compare done?, 0/false
 511     break-if-!=
 512     var g/eax: grapheme <- peek-from-gap-buffer in
 513     {
           # should-trace? gets its own block so that it goes out of scope before
           # 'g' is used again; the break-if-= below acts on this compare
           # (presumably eax holds g again after the block -- verify against Mu's
           # register-shadowing rules)
 514       {
 515         var should-trace?/eax: boolean <- should-trace? trace
 516         compare should-trace?, 0/false
 517       }
 518       break-if-=
 519       var stream-storage: (stream byte 0x40)
 520       var stream/esi: (addr stream byte) <- address stream-storage
 521       write stream, "next: "
 522       var gval/eax: int <- copy g
 523       write-int32-hex stream, gval
 524       trace trace, "tokenize", stream
 525     }
 526     # if non-symbol, return
 527     {
 528       var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
 529       compare symbol-grapheme?, 0/false
 530       break-if-!=
 531       trace-text trace, "tokenize", "stop"
 532       break $next-symbol-token:loop
 533     }
         # consume the grapheme that was peeked and append it to the token
 534     var g/eax: grapheme <- read-from-gap-buffer in
 535     write-grapheme out-data, g
 536     loop
 537   }
 538   trace-higher trace
 539   {
 540     var should-trace?/eax: boolean <- should-trace? trace
 541     compare should-trace?, 0/false
 542     break-if-=
         # NOTE(review): this buffer is 0x40 bytes but receives "=> " plus up to
         # 0x40 bytes of token text -- confirm what write-stream does when the
         # destination is too small
 543     var stream-storage: (stream byte 0x40)
 544     var stream/esi: (addr stream byte) <- address stream-storage
 545     write stream, "=> "
         # rewind before reading the token text back for the trace
 546     rewind-stream out-data
 547     write-stream stream, out-data
 548     trace trace, "tokenize", stream
 549   }
 550 }
 551 
 552 fn next-number-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
       # Populate out's text-data (0x40 bytes) with an optional leading '-'
       # followed by decimal digits read from 'in'.
       # Stops (without consuming) at the first non-symbol grapheme or when the
       # scan is done. A symbol grapheme that isn't a decimal digit is reported
       # to 'trace' as "invalid number".
 553   trace-text trace, "tokenize", "looking for a number"
 554   trace-lower trace
 555   var out/eax: (addr token) <- copy _out
 556   var out-data-ah/eax: (addr handle stream byte) <- get out, text-data
 557   populate-stream out-data-ah, 0x40
 558   var _out-data/eax: (addr stream byte) <- lookup *out-data-ah
       # move to edi; eax is needed for graphemes and call results
 559   var out-data/edi: (addr stream byte) <- copy _out-data
       # optional sign
 560   $next-number-token:check-minus: {
 561     var g/eax: grapheme <- peek-from-gap-buffer in
 562     compare g, 0x2d/minus
         # without this branch the first grapheme was consumed and stored
         # unconditionally, bypassing the digit check in the loop below
         break-if-!=
 563     g <- read-from-gap-buffer in  # consume
 564     write-grapheme out-data, g
 565   }
 566   $next-number-token:loop: {
 567     var done?/eax: boolean <- gap-buffer-scan-done? in
 568     compare done?, 0/false
 569     break-if-!=
 570     var g/eax: grapheme <- peek-from-gap-buffer in
 571     {
           # should-trace? is scoped to its own block; the break-if-= below acts
           # on the compare inside it
 572       {
 573         var should-trace?/eax: boolean <- should-trace? trace
 574         compare should-trace?, 0/false
 575       }
 576       break-if-=
 577       var stream-storage: (stream byte 0x40)
 578       var stream/esi: (addr stream byte) <- address stream-storage
 579       write stream, "next: "
 580       var gval/eax: int <- copy g
 581       write-int32-hex stream, gval
 582       trace trace, "tokenize", stream
 583     }
 584     # if not symbol grapheme, return
 585     {
 586       var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
 587       compare symbol-grapheme?, 0/false
 588       break-if-!=
 589       trace-text trace, "tokenize", "stop"
 590       break $next-number-token:loop
 591     }
 592     # if not digit grapheme, abort
 593     {
 594       var digit?/eax: boolean <- decimal-digit? g
 595       compare digit?, 0/false
 596       break-if-!=
 597       error trace, "invalid number"
           # NOTE(review): like the other error exits in this file, this returns
           # without a matching trace-higher
 598       return
 599     }
 600     trace-text trace, "tokenize", "append"
 601     var g/eax: grapheme <- read-from-gap-buffer in
 602     write-grapheme out-data, g
 603     loop
 604   }
 605   trace-higher trace
 606 }
 607 
 608 fn next-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
       # Turn 'out' into a stream token (type 1) holding every grapheme read from
       # 'in' up to the first ']'. The ']' is consumed but not stored.
       # The one caller in this file (next-token) consumes the opening '[' first.
       # Reports "unbalanced '['" to 'trace' if the scan finishes before a ']'.
 609   trace-text trace, "tokenize", "stream"
 610   var out/edi: (addr token) <- copy _out
 611   var out-type/eax: (addr int) <- get out, type
 612   copy-to *out-type, 1/stream
 613   var out-data-ah/eax: (addr handle stream byte) <- get out, text-data
 614   # stream tokens contain whole function definitions on boot, so we always
 615   # give them plenty of space
 616   populate-stream out-data-ah, 0x400/max-definition-size=1KB
 617   var _out-data/eax: (addr stream byte) <- lookup *out-data-ah
       # 'out' isn't used past this point, so edi is reused for the payload
 618   var out-data/edi: (addr stream byte) <- copy _out-data
 619   {
 620     var empty?/eax: boolean <- gap-buffer-scan-done? in
 621     compare empty?, 0/false
 622     {
 623       break-if-=
 624       error trace, "unbalanced '['"
 625       return
 626     }
         # only ']' is special here; any '[' inside is copied like other data
 627     var g/eax: grapheme <- read-from-gap-buffer in
 628     compare g, 0x5d/close-square-bracket
 629     break-if-=
 630     write-grapheme out-data, g
 631     loop
 632   }
 633   {
 634     var should-trace?/eax: boolean <- should-trace? trace
 635     compare should-trace?, 0/false
 636     break-if-=
         # NOTE(review): "=> " plus a full 0x400-byte payload is larger than this
         # buffer -- confirm what write-stream does in that case
 637     var stream-storage: (stream byte 0x400)  # max-definition-size
 638     var stream/esi: (addr stream byte) <- address stream-storage
 639     write stream, "=> "
         # rewind before reading the payload back for the trace
 640     rewind-stream out-data
 641     write-stream stream, out-data
 642     trace trace, "tokenize", stream
 643   }
 644 }
 645 
 646 fn next-bracket-token g: grapheme, _out: (addr token), trace: (addr trace) {
       # Populate out's text-data (0x40 bytes) and store the single grapheme 'g'
       # in it. Nothing is read from the input here; the caller passes in the
       # grapheme.
 647   trace-text trace, "tokenize", "bracket"
 648   var out/eax: (addr token) <- copy _out
 649   var out-data-ah/eax: (addr handle stream byte) <- get out, text-data
 650   populate-stream out-data-ah, 0x40
 651   var _out-data/eax: (addr stream byte) <- lookup *out-data-ah
       # move to edi; eax is reused for should-trace? below
 652   var out-data/edi: (addr stream byte) <- copy _out-data
 653   write-grapheme out-data, g
 654   {
 655     var should-trace?/eax: boolean <- should-trace? trace
 656     compare should-trace?, 0/false
 657     break-if-=
 658     var stream-storage: (stream byte 0x40)
 659     var stream/esi: (addr stream byte) <- address stream-storage
 660     write stream, "=> "
         # rewind before reading the token text back for the trace
 661     rewind-stream out-data
 662     write-stream stream, out-data
 663     trace trace, "tokenize", stream
 664   }
 665 }
 666 
 667 fn skip-rest-of-line in: (addr gap-buffer) {
       # Consume graphemes from 'in' until the next one is a newline or the scan
       # is done. The newline itself is left in place.
 668   {
 669     var at-end?/eax: boolean <- gap-buffer-scan-done? in
 670     compare at-end?, 0/false
 671     break-if-!=
 672     var next/eax: grapheme <- peek-from-gap-buffer in
 673     compare next, 0xa/newline
 674     break-if-=
 675     next <- read-from-gap-buffer in  # drop it and keep going
 676     loop
 677   }
 678 }
 679 
 680 fn next-indent-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
       # Turn 'out' into an indent token (type 3) and set its number-data to the
       # number of consecutive spaces consumed from the front of 'in'.
       # Only 0x20/space is counted; any other grapheme ends the count and is
       # left unread.
 681   trace-text trace, "tokenize", "indent"
 682   trace-lower trace
 683   var out/edi: (addr token) <- copy _out
 684   var out-type/eax: (addr int) <- get out, type
 685   copy-to *out-type, 3/indent
       # 'out' isn't used past this point, so edi is reused for the counter's address
 686   var dest/edi: (addr int) <- get out, number-data
 687   copy-to *dest, 0
 688   {
 689     var done?/eax: boolean <- gap-buffer-scan-done? in
 690     compare done?, 0/false
 691     break-if-!=
 692     var g/eax: grapheme <- peek-from-gap-buffer in
 693     {
           # should-trace? is scoped to its own block; the break-if-= below acts
           # on the compare inside it
 694       {
 695         var should-trace?/eax: boolean <- should-trace? trace
 696         compare should-trace?, 0/false
 697       }
 698       break-if-=
 699       var stream-storage: (stream byte 0x40)
 700       var stream/esi: (addr stream byte) <- address stream-storage
 701       write stream, "next: "
 702       var gval/eax: int <- copy g
 703       write-int32-hex stream, gval
 704       trace trace, "tokenize", stream
 705     }
 706     # if non-space, break
 707     compare g, 0x20/space
 708     break-if-!=
         # consume the space and count it
 709     g <- read-from-gap-buffer in
 710     increment *dest
 711     loop
 712   }
 713   trace-higher trace
 714   {
 715     var should-trace?/eax: boolean <- should-trace? trace
 716     compare should-trace?, 0/false
 717     break-if-=
 718     var stream-storage: (stream byte 0x40)
 719     var stream/esi: (addr stream byte) <- address stream-storage
 720     write stream, "=> indent "
 721     write-int32-hex stream, *dest
 722     trace trace, "tokenize", stream
 723   }
 724 }
 725 
 726 # Mu carves up the space of graphemes into 4 categories:
 727 #   whitespace
 728 #   quotes and unquotes (from a Lisp perspective; doesn't include double
 729 #                        quotes or other Unicode quotes)
 730 #   operators
 731 #   symbols
 732 # (Numbers have their own parsing rules that don't fit cleanly in this
 733 # partition.)
 734 #
 735 # During tokenization operators and symbols are treated identically.
 736 # A later phase digs into that nuance.
 737 
 738 fn symbol-grapheme? g: grapheme -> _/eax: boolean {
 739   # A symbol grapheme is anything that is not whitespace, not a quote or
 740   # unquote character, not a bracket, not '#' and not '"'.
 741   # Any exclusion that applies jumps past the 'return 1' below.
 742   {
 743     var excluded?/eax: boolean <- whitespace-grapheme? g
 744     compare excluded?, 0/false
 745     break-if-!=
 746     excluded? <- quote-or-unquote-grapheme? g
 747     compare excluded?, 0/false
 748     break-if-!=
 749     excluded? <- bracket-grapheme? g
 750     compare excluded?, 0/false
 751     break-if-!=
 752     compare g, 0x23/hash  # comments get filtered out
 753     break-if-=
 754     compare g, 0x22/double-quote  # double quotes reserved for now
 755     break-if-=
 756     return 1/true
 757   }
 758   return 0/false
 759 }
 769 
 770 fn whitespace-grapheme? g: grapheme -> _/eax: boolean {
 771   compare g, 9/tab
 772   {
 773     break-if-!=
 774     return 1/true
 775   }
 776   compare g, 0xa/newline
 777   {
 778     break-if-!=
 779     return 1/true
 780   }
 781   compare g, 0x20/space
 782   {
 783     break-if-!=
 784     return 1/true
 785   }
 786   return 0/false
 787 }
 788 
 789 fn quote-or-unquote-grapheme? g: grapheme -> _/eax: boolean {
 790   compare g, 0x27/single-quote
 791   {
 792     break-if-!=
 793     return 1/true
 794   }
 795   compare g, 0x60/backquote
 796   {
 797     break-if-!=
 798     return 1/true
 799   }
 800   compare g, 0x2c/comma
 801   {
 802     break-if-!=
 803     return 1/true
 804   }
 805   compare g, 0x40/at-sign
 806   {
 807     break-if-!=
 808     return 1/true
 809   }
 810   return 0/false
 811 }
 812 
 813 fn bracket-grapheme? g: grapheme -> _/eax: boolean {
 814   compare g, 0x28/open-paren
 815   {
 816     break-if-!=
 817     return 1/true
 818   }
 819   compare g, 0x29/close-paren
 820   {
 821     break-if-!=
 822     return 1/true
 823   }
 824   compare g, 0x5b/open-square-bracket
 825   {
 826     break-if-!=
 827     return 1/true
 828   }
 829   compare g, 0x5d/close-square-bracket
 830   {
 831     break-if-!=
 832     return 1/true
 833   }
 834   compare g, 0x7b/open-curly-bracket
 835   {
 836     break-if-!=
 837     return 1/true
 838   }
 839   compare g, 0x7d/close-curly-bracket
 840   {
 841     break-if-!=
 842     return 1/true
 843   }
 844   return 0/false
 845 }
 846 
 847 fn number-token? _self: (addr token) -> _/eax: boolean {
 848   var self/eax: (addr token) <- copy _self
 849   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 850   var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
 851   var in-data/ecx: (addr stream byte) <- copy _in-data
 852   rewind-stream in-data
 853   var g/eax: grapheme <- read-grapheme in-data
 854   # if '-', read another
 855   {
 856     compare g, 0x2d/minus
 857     break-if-!=
 858     g <- read-grapheme in-data
 859   }
 860   {
 861     {
 862       var result/eax: boolean <- decimal-digit? g
 863       compare result, 0/false
 864       break-if-!=
 865       return 0/false
 866     }
 867     {
 868       var done?/eax: boolean <- stream-empty? in-data
 869       compare done?, 0/false
 870     }
 871     break-if-!=
 872     g <- read-grapheme in-data
 873     loop
 874   }
 875   return 1/true
 876 }
 877 
 878 fn bracket-token? _self: (addr token) -> _/eax: boolean {
 879   var self/eax: (addr token) <- copy _self
 880   {
 881     var in-type/eax: (addr int) <- get self, type
 882     compare *in-type, 1/stream
 883     break-if-!=
 884     # streams are never paren tokens
 885     return 0/false
 886   }
 887   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 888   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
 889   rewind-stream in-data
 890   var g/eax: grapheme <- read-grapheme in-data
 891   var result/eax: boolean <- bracket-grapheme? g
 892   return result
 893 }
 894 
 895 fn quote-token? _self: (addr token) -> _/eax: boolean {
 896   var self/eax: (addr token) <- copy _self
 897   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 898   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
 899   rewind-stream in-data
 900   var result/eax: boolean <- stream-data-equal? in-data, "'"
 901   return result
 902 }
 903 
 904 fn backquote-token? _self: (addr token) -> _/eax: boolean {
 905   var self/eax: (addr token) <- copy _self
 906   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 907   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
 908   rewind-stream in-data
 909   var result/eax: boolean <- stream-data-equal? in-data, "`"
 910   return result
 911 }
 912 
 913 fn unquote-token? _self: (addr token) -> _/eax: boolean {
 914   var self/eax: (addr token) <- copy _self
 915   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 916   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
 917   rewind-stream in-data
 918   var result/eax: boolean <- stream-data-equal? in-data, ","
 919   return result
 920 }
 921 
 922 fn unquote-splice-token? _self: (addr token) -> _/eax: boolean {
 923   var self/eax: (addr token) <- copy _self
 924   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 925   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
 926   rewind-stream in-data
 927   var result/eax: boolean <- stream-data-equal? in-data, ",@"
 928   return result
 929 }
 930 
 931 fn open-paren-token? _self: (addr token) -> _/eax: boolean {
 932   var self/eax: (addr token) <- copy _self
 933   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 934   var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
 935   var in-data/ecx: (addr stream byte) <- copy _in-data
 936   rewind-stream in-data
 937   var g/eax: grapheme <- read-grapheme in-data
 938   compare g, 0x28/open-paren
 939   {
 940     break-if-!=
 941     var result/eax: boolean <- stream-empty? in-data
 942     return result
 943   }
 944   return 0/false
 945 }
 946 
 947 fn close-paren-token? _self: (addr token) -> _/eax: boolean {
 948   var self/eax: (addr token) <- copy _self
 949   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 950   var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
 951   var in-data/ecx: (addr stream byte) <- copy _in-data
 952   rewind-stream in-data
 953   var g/eax: grapheme <- read-grapheme in-data
 954   compare g, 0x29/close-paren
 955   {
 956     break-if-!=
 957     var result/eax: boolean <- stream-empty? in-data
 958     return result
 959   }
 960   return 0/false
 961 }
 962 
 963 fn dot-token? _self: (addr token) -> _/eax: boolean {
 964   var self/eax: (addr token) <- copy _self
 965   var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
 966   var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
 967   var in-data/ecx: (addr stream byte) <- copy _in-data
 968   rewind-stream in-data
 969   var g/eax: grapheme <- read-grapheme in-data
 970   compare g, 0x2e/dot
 971   {
 972     break-if-!=
 973     var result/eax: boolean <- stream-empty? in-data
 974     return result
 975   }
 976   return 0/false
 977 }
 978 
 979 fn test-dot-token {
 980   var tmp-storage: (handle token)
 981   var tmp-ah/eax: (addr handle token) <- address tmp-storage
 982   allocate-token tmp-ah
 983   var tmp/eax: (addr token) <- lookup *tmp-ah
 984   initialize-token tmp, "."
 985   var result/eax: boolean <- dot-token? tmp
 986   check result, "F - test-dot-token"
 987 }
 988 
 989 fn stream-token? _self: (addr token) -> _/eax: boolean {
 990   var self/eax: (addr token) <- copy _self
 991   var in-type/eax: (addr int) <- get self, type
 992   compare *in-type, 1/stream
 993   {
 994     break-if-=
 995     return 0/false
 996   }
 997   return 1/true
 998 }
 999 
# Report whether token `_self` has type 2 (skip).
fn skip-token? _self: (addr token) -> _/eax: boolean {
  var token/eax: (addr token) <- copy _self
  var type-addr/eax: (addr int) <- get token, type
  {
    compare *type-addr, 2/skip
    break-if-!=
    return 1/true
  }
  return 0/false
}
1010 
# Report whether token `_self` has type 3 (indent).
fn indent-token? _self: (addr token) -> _/eax: boolean {
  var token/eax: (addr token) <- copy _self
  var type-addr/eax: (addr int) <- get token, type
  {
    compare *type-addr, 3/indent
    break-if-!=
    return 1/true
  }
  return 0/false
}
1021 
# Allocate a token into the handle at `_self-ah`, then populate its text-data
# stream with size 0x40.
fn allocate-token _self-ah: (addr handle token) {
  var token-ah/eax: (addr handle token) <- copy _self-ah
  allocate token-ah
  var token/eax: (addr token) <- lookup *token-ah
  # set up the stream that will hold the token's text
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  populate-stream text-ah, 0x40/max-symbol-size
}
1029 
# Give token `_self` a freshly populated text-data stream (size 0x40) and
# write `val` into it.
# No other field of the token is touched.
fn initialize-token _self: (addr token), val: (addr array byte) {
  var token/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  populate-stream text-ah, 0x40/max-symbol-size
  var text/eax: (addr stream byte) <- lookup *text-ah
  write text, val
}
1037 
# Mark token `_self` as a skip token by setting its type to 2.
# No other field of the token is touched.
fn initialize-skip-token _self: (addr token) {
  var token/eax: (addr token) <- copy _self
  var type-addr/eax: (addr int) <- get token, type
  copy-to *type-addr, 2/skip
}
1043 
# Write the text-data of token `_self` to `out`.
# The token's text-data stream is rewound first, then passed to write-stream.
fn write-token-text-data out: (addr stream byte), _self: (addr token) {
  var token/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  write-stream out, text
}
1051 
# Report whether tokens `_a` and `_b` are equal:
#   - tokens of different types are never equal
#   - two skip tokens are always equal
#   - two indent tokens are equal when their number-data matches
#   - any other pair is compared by text-data, using streams-data-equal?
fn tokens-equal? _a: (addr token), _b: (addr token) -> _/eax: boolean {
  var a/edx: (addr token) <- copy _a
  var b/ebx: (addr token) <- copy _b
  # the types have to agree
  var a-kind-addr/eax: (addr int) <- get a, type
  var kind/eax: int <- copy *a-kind-addr
  var b-kind-addr/ecx: (addr int) <- get b, type
  {
    compare kind, *b-kind-addr
    break-if-=
    return 0/false
  }
  # skip tokens carry no payload to compare
  {
    compare kind, 2/skip
    break-if-!=
    return 1/true
  }
  # indent tokens are compared by their number-data
  {
    compare kind, 3/indent
    break-if-!=
    var a-indent-addr/eax: (addr int) <- get a, number-data
    var a-indent/eax: int <- copy *a-indent-addr
    var b-indent-addr/ecx: (addr int) <- get b, number-data
    {
      compare a-indent, *b-indent-addr
      break-if-!=
      return 1/true
    }
    return 0/false
  }
  # everything else is compared by text
  var b-text-ah/eax: (addr handle stream byte) <- get b, text-data
  var _b-text/eax: (addr stream byte) <- lookup *b-text-ah
  # move b's text out of eax so a's text can be looked up next
  var b-text/ebx: (addr stream byte) <- copy _b-text
  var a-text-ah/eax: (addr handle stream byte) <- get a, text-data
  var a-text/eax: (addr stream byte) <- lookup *a-text-ah
  var same-text?/eax: boolean <- streams-data-equal? a-text, b-text
  return same-text?
}
1091 
# Debugging aid: draw token `_t` on screen 0 starting at the cursor, as
#   <type> <text-data> <number-data>
# followed by a newline, all in color 7 on background 0.
# The token's text-data stream is rewound as a side effect.
fn dump-token-from-cursor _t: (addr token) {
  var token/esi: (addr token) <- copy _t
  # type
  var kind-addr/eax: (addr int) <- get token, type
  draw-int32-decimal-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, *kind-addr, 7/fg 0/bg
  draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 7/fg 0/bg
  # text-data
  var contents-ah/eax: (addr handle stream byte) <- get token, text-data
  var contents/eax: (addr stream byte) <- lookup *contents-ah
  rewind-stream contents
  draw-stream-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, contents, 7/fg 0/bg
  draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 7/fg 0/bg
  # number-data
  var number-addr/eax: (addr int) <- get token, number-data
  draw-int32-decimal-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, *number-addr, 7/fg 0/bg
  draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "\n", 7/fg 0/bg
}
1105 }