about summary refs log tree commit diff stats
path: root/js/scripting-lang/lexer.js
diff options
context:
space:
mode:
Diffstat (limited to 'js/scripting-lang/lexer.js')
-rw-r--r--    js/scripting-lang/lexer.js    127
1 file changed, 122 insertions, 5 deletions
diff --git a/js/scripting-lang/lexer.js b/js/scripting-lang/lexer.js
index 4c50b6e..775229a 100644
--- a/js/scripting-lang/lexer.js
+++ b/js/scripting-lang/lexer.js
@@ -12,17 +12,68 @@
  * - Operators: PLUS, MINUS, MULTIPLY, DIVIDE, MODULO, POWER, etc.
  * - Keywords: WHEN, IS, THEN, FUNCTION, etc.
  * - Punctuation: LEFT_PAREN, RIGHT_PAREN, SEMICOLON, COMMA, etc.
- * - Special: IO_IN, IO_OUT, IO_ASSERT, FUNCTION_REF, FUNCTION_ARG
+ * - Special: IO_IN, IO_OUT, IO_ASSERT, IO_LISTEN, IO_EMIT, FUNCTION_REF, FUNCTION_ARG
  * 
  * This enumeration provides a centralized definition of all possible
  * token types, ensuring consistency between lexer and parser. The token
  * types are designed to support the combinator-based architecture where
  * all operations are translated to function calls.
+ * 
+ * @typedef {Object} TokenType
+ * @property {string} NUMBER - Numeric literals (integers and floats)
+ * @property {string} PLUS - Addition operator (+)
+ * @property {string} MINUS - Subtraction operator (-)
+ * @property {string} UNARY_MINUS - Unary minus, detected by spacing (e.g. -5)
+ * @property {string} BINARY_MINUS - Binary minus, detected by spacing (e.g. 5 - 3)
+ * @property {string} MULTIPLY - Multiplication operator (*)
+ * @property {string} DIVIDE - Division operator (/)
+ * @property {string} IDENTIFIER - Variable names and function names
+ * @property {string} ASSIGNMENT - Assignment operator (:)
+ * @property {string} ARROW - Function arrow (->)
+ * @property {string} CASE - Case keyword
+ * @property {string} OF - Of keyword
+ * @property {string} WHEN - When keyword for pattern matching
+ * @property {string} IS - Is keyword for pattern matching
+ * @property {string} THEN - Then keyword for pattern matching
+ * @property {string} WILDCARD - Wildcard pattern (_)
+ * @property {string} FUNCTION - Function keyword
+ * @property {string} LEFT_PAREN - Left parenthesis (()
+ * @property {string} RIGHT_PAREN - Right parenthesis ())
+ * @property {string} LEFT_BRACE - Left brace ({)
+ * @property {string} RIGHT_BRACE - Right brace (})
+ * @property {string} LEFT_BRACKET - Left bracket ([)
+ * @property {string} RIGHT_BRACKET - Right bracket (])
+ * @property {string} SEMICOLON - Semicolon (;)
+ * @property {string} COMMA - Comma (,)
+ * @property {string} DOT - Dot (.)
+ * @property {string} STRING - String literals
+ * @property {string} TRUE - Boolean true literal
+ * @property {string} FALSE - Boolean false literal
+ * @property {string} AND - Logical AND operator
+ * @property {string} OR - Logical OR operator
+ * @property {string} XOR - Logical XOR operator
+ * @property {string} NOT - Logical NOT operator
+ * @property {string} EQUALS - Equality operator (==)
+ * @property {string} LESS_THAN - Less than operator (<)
+ * @property {string} GREATER_THAN - Greater than operator (>)
+ * @property {string} LESS_EQUAL - Less than or equal operator (<=)
+ * @property {string} GREATER_EQUAL - Greater than or equal operator (>=)
+ * @property {string} NOT_EQUAL - Not equal operator (!=)
+ * @property {string} MODULO - Modulo operator (%)
+ * @property {string} POWER - Power operator (^)
+ * @property {string} IO_IN - Input operation (..in)
+ * @property {string} IO_OUT - Output operation (..out)
+ * @property {string} IO_ASSERT - Assertion operation (..assert)
+ * @property {string} IO_LISTEN - Listen operation (..listen)
+ * @property {string} IO_EMIT - Emit operation (..emit)
+ * @property {string} FUNCTION_REF - Function reference (@function)
+ * @property {string} FUNCTION_ARG - Function argument (@(expression))
+ * @property {string} COMPOSE - Function composition (via)
  */
 export const TokenType = {
     NUMBER: 'NUMBER',
     PLUS: 'PLUS',
     MINUS: 'MINUS',
+    UNARY_MINUS: 'UNARY_MINUS',
+    BINARY_MINUS: 'BINARY_MINUS',
     MULTIPLY: 'MULTIPLY',
     DIVIDE: 'DIVIDE',
     IDENTIFIER: 'IDENTIFIER',
@@ -62,16 +113,29 @@ export const TokenType = {
     IO_IN: 'IO_IN',
     IO_OUT: 'IO_OUT',
     IO_ASSERT: 'IO_ASSERT',
+    IO_LISTEN: 'IO_LISTEN',
+    IO_EMIT: 'IO_EMIT',
     FUNCTION_REF: 'FUNCTION_REF',
     FUNCTION_ARG: 'FUNCTION_ARG',
     COMPOSE: 'COMPOSE'
 };
 
 /**
- * Converts source code into tokens
+ * Token object structure
+ * 
+ * @typedef {Object} Token
+ * @property {string} type - The token type from TokenType enum
+ * @property {*} [value] - The token's value (for literals and identifiers)
+ * @property {string} [name] - Function name (for FUNCTION_REF tokens)
+ * @property {number} line - Line number where token appears (1-indexed)
+ * @property {number} column - Column number where token appears (1-indexed)
+ */
+
+/**
+ * Converts source code into tokens for the combinator-based language
  * 
  * @param {string} input - The source code to tokenize
- * @returns {Array.<Object>} Array of token objects with type, value, line, and column
+ * @returns {Array.<Token>} Array of token objects with type, value, line, and column
  * @throws {Error} For unexpected characters or malformed tokens
  * 
  * @description The lexer performs lexical analysis by converting source code
@@ -95,6 +159,23 @@ export const TokenType = {
  * for malformed input, making it easier to debug syntax errors in user code.
  * It supports the combinator-based architecture by recognizing all operators
  * and special tokens needed for function composition and application.
+ * 
+ * The lexer is the first step in the language processing pipeline and must
+ * correctly identify all tokens that the parser will translate into function
+ * calls. This includes operators that will become combinator function calls,
+ * function references that enable higher-order programming, and special
+ * keywords that support the functional programming paradigm.
+ * 
+ * The lexer uses a state machine approach where each character type triggers
+ * different parsing strategies. This design enables efficient tokenization
+ * while maintaining clear separation of concerns for different token types.
+ * The character-by-character approach allows for precise error reporting and
+ * supports multi-character tokens like operators and string literals
+ * with escape sequences.
+ * 
+ * Error handling is designed to provide meaningful feedback by including
+ * line and column information in error messages. This enables users to
+ * quickly locate and fix syntax errors in their code.
  */
 export function lexer(input) {
     const tokens = [];
@@ -102,6 +183,19 @@ export function lexer(input) {
     let line = 1;
     let column = 1;
 
+    // Helper functions for spacing detection
+    function hasLeadingWhitespace() {
+        let pos = current - 1;
+        while (pos >= 0 && /\s/.test(input[pos])) pos--;
+        return pos >= 0 && input[pos] !== '\n' && input[pos] !== ';';
+    }
+
+    function hasLeadingAndTrailingSpaces() {
+        const hasLeading = current > 0 && /\s/.test(input[current - 1]);
+        const hasTrailing = current + 1 < input.length && /\s/.test(input[current + 1]);
+        return hasLeading && hasTrailing;
+    }
+
     while (current < input.length) {
         let char = input[current];
 
@@ -170,6 +264,12 @@ export function lexer(input) {
                 case 'assert':
                     tokens.push({ type: TokenType.IO_ASSERT, line, column: column - operation.length - 2 });
                     break;
+                case 'listen':
+                    tokens.push({ type: TokenType.IO_LISTEN, line, column: column - operation.length - 2 });
+                    break;
+                case 'emit':
+                    tokens.push({ type: TokenType.IO_EMIT, line, column: column - operation.length - 2 });
+                    break;
                 default:
                     throw new Error(`Unknown IO operation: ..${operation} at line ${line}, column ${column - operation.length - 2}`);
             }
@@ -264,7 +364,7 @@ export function lexer(input) {
                 case 'function':
                     tokens.push({ type: TokenType.FUNCTION, line, column: startColumn });
                     break;
-                case 'via':
+                case 'via': // Function composition operator: f via g = compose(f, g)
                     tokens.push({ type: TokenType.COMPOSE, line, column: startColumn });
                     break;
                 case '_':
@@ -320,7 +420,24 @@ export function lexer(input) {
                     current++;
                     column++;
                 } else {
-                    tokens.push({ type: TokenType.MINUS, line, column });
+                    // Check spacing to determine token type
+                    const isUnary = !hasLeadingWhitespace();
+                    const isBinary = hasLeadingAndTrailingSpaces();
+                    const isFollowedByNumber = current + 1 < input.length && /[0-9]/.test(input[current + 1]);
+                    
+                    if (isUnary && isFollowedByNumber) {
+                        // Unary minus at start of expression: -5
+                        tokens.push({ type: TokenType.UNARY_MINUS, line, column });
+                    } else if (isBinary) {
+                        // Binary minus with spaces: 5 - 3
+                        tokens.push({ type: TokenType.BINARY_MINUS, line, column });
+                    } else if (isFollowedByNumber) {
+                        // Minus followed by number but not at start: 5-3 (legacy)
+                        tokens.push({ type: TokenType.MINUS, line, column });
+                    } else {
+                        // Fallback to legacy MINUS token for edge cases
+                        tokens.push({ type: TokenType.MINUS, line, column });
+                    }
                 }
                 break;
             case '*':