Diffstat (limited to 'js/scripting-lang/lexer.js')
-rw-r--r--  js/scripting-lang/lexer.js  127
1 file changed, 122 insertions, 5 deletions
diff --git a/js/scripting-lang/lexer.js b/js/scripting-lang/lexer.js
index 4c50b6e..775229a 100644
--- a/js/scripting-lang/lexer.js
+++ b/js/scripting-lang/lexer.js
@@ -12,17 +12,68 @@
  * - Operators: PLUS, MINUS, MULTIPLY, DIVIDE, MODULO, POWER, etc.
  * - Keywords: WHEN, IS, THEN, FUNCTION, etc.
  * - Punctuation: LEFT_PAREN, RIGHT_PAREN, SEMICOLON, COMMA, etc.
- * - Special: IO_IN, IO_OUT, IO_ASSERT, FUNCTION_REF, FUNCTION_ARG
+ * - Special: IO_IN, IO_OUT, IO_ASSERT, IO_LISTEN, IO_EMIT, FUNCTION_REF, FUNCTION_ARG
  *
  * This enumeration provides a centralized definition of all possible
  * token types, ensuring consistency between lexer and parser. The token
  * types are designed to support the combinator-based architecture where
  * all operations are translated to function calls.
+ *
+ * @typedef {Object} TokenType
+ * @property {string} NUMBER - Numeric literals (integers and floats)
+ * @property {string} PLUS - Addition operator (+)
+ * @property {string} MINUS - Subtraction operator (-)
+ * @property {string} MULTIPLY - Multiplication operator (*)
+ * @property {string} DIVIDE - Division operator (/)
+ * @property {string} IDENTIFIER - Variable names and function names
+ * @property {string} ASSIGNMENT - Assignment operator (:)
+ * @property {string} ARROW - Function arrow (->)
+ * @property {string} CASE - Case keyword
+ * @property {string} OF - Of keyword
+ * @property {string} WHEN - When keyword for pattern matching
+ * @property {string} IS - Is keyword for pattern matching
+ * @property {string} THEN - Then keyword for pattern matching
+ * @property {string} WILDCARD - Wildcard pattern (_)
+ * @property {string} FUNCTION - Function keyword
+ * @property {string} LEFT_PAREN - Left parenthesis (()
+ * @property {string} RIGHT_PAREN - Right parenthesis ())
+ * @property {string} LEFT_BRACE - Left brace ({)
+ * @property {string} RIGHT_BRACE - Right brace (})
+ * @property {string} LEFT_BRACKET - Left bracket ([)
+ * @property {string} RIGHT_BRACKET - Right bracket (])
+ * @property {string} SEMICOLON - Semicolon (;)
+ * @property {string} COMMA - Comma (,)
+ * @property {string} DOT - Dot (.)
+ * @property {string} STRING - String literals
+ * @property {string} TRUE - Boolean true literal
+ * @property {string} FALSE - Boolean false literal
+ * @property {string} AND - Logical AND operator
+ * @property {string} OR - Logical OR operator
+ * @property {string} XOR - Logical XOR operator
+ * @property {string} NOT - Logical NOT operator
+ * @property {string} EQUALS - Equality operator (==)
+ * @property {string} LESS_THAN - Less than operator (<)
+ * @property {string} GREATER_THAN - Greater than operator (>)
+ * @property {string} LESS_EQUAL - Less than or equal operator (<=)
+ * @property {string} GREATER_EQUAL - Greater than or equal operator (>=)
+ * @property {string} NOT_EQUAL - Not equal operator (!=)
+ * @property {string} MODULO - Modulo operator (%)
+ * @property {string} POWER - Power operator (^)
+ * @property {string} IO_IN - Input operation (..in)
+ * @property {string} IO_OUT - Output operation (..out)
+ * @property {string} IO_ASSERT - Assertion operation (..assert)
+ * @property {string} IO_LISTEN - Listen operation (..listen)
+ * @property {string} IO_EMIT - Emit operation (..emit)
+ * @property {string} FUNCTION_REF - Function reference (@function)
+ * @property {string} FUNCTION_ARG - Function argument (@(expression))
+ * @property {string} COMPOSE - Function composition (via)
  */
 export const TokenType = {
     NUMBER: 'NUMBER',
     PLUS: 'PLUS',
     MINUS: 'MINUS',
+    UNARY_MINUS: 'UNARY_MINUS',
+    BINARY_MINUS: 'BINARY_MINUS',
     MULTIPLY: 'MULTIPLY',
     DIVIDE: 'DIVIDE',
     IDENTIFIER: 'IDENTIFIER',
@@ -62,16 +113,29 @@ export const TokenType = {
     IO_IN: 'IO_IN',
     IO_OUT: 'IO_OUT',
     IO_ASSERT: 'IO_ASSERT',
+    IO_LISTEN: 'IO_LISTEN',
+    IO_EMIT: 'IO_EMIT',
     FUNCTION_REF: 'FUNCTION_REF',
     FUNCTION_ARG: 'FUNCTION_ARG',
     COMPOSE: 'COMPOSE'
 };

 /**
- * Converts source code into tokens
+ * Token object structure
+ *
+ * @typedef {Object} Token
+ * @property {string} type - The token type from TokenType enum
+ * @property {*} [value] - The token's value (for literals and identifiers)
+ * @property {string} [name] - Function name (for FUNCTION_REF tokens)
+ * @property {number} line - Line number where token appears (1-indexed)
+ * @property {number} column - Column number where token appears (1-indexed)
+ */
+
+/**
+ * Converts source code into tokens for the combinator-based language
  *
  * @param {string} input - The source code to tokenize
- * @returns {Array.<Object>} Array of token objects with type, value, line, and column
+ * @returns {Array.<Token>} Array of token objects with type, value, line, and column
  * @throws {Error} For unexpected characters or malformed tokens
  *
  * @description The lexer performs lexical analysis by converting source code
@@ -95,6 +159,23 @@
  * for malformed input, making it easier to debug syntax errors in user code.
  * It supports the combinator-based architecture by recognizing all operators
  * and special tokens needed for function composition and application.
+ *
+ * The lexer is the first step in the language processing pipeline and must
+ * correctly identify all tokens that the parser will translate into function
+ * calls. This includes operators that will become combinator function calls,
+ * function references that enable higher-order programming, and special
+ * keywords that support the functional programming paradigm.
+ *
+ * The lexer uses a state machine approach where each character type triggers
+ * different parsing strategies. This design enables efficient tokenization
+ * while maintaining clear separation of concerns for different token types.
+ * The character-by-character approach allows for precise error reporting and
+ * supports multi-character tokens like operators and string literals
+ * with escape sequences.
+ *
+ * Error handling is designed to provide meaningful feedback by including
+ * line and column information in error messages. This enables users to
+ * quickly locate and fix syntax errors in their code.
  */
 export function lexer(input) {
     const tokens = [];
@@ -102,6 +183,19 @@
     let line = 1;
     let column = 1;

+    // Helper functions for spacing detection
+    function hasLeadingWhitespace() {
+        let pos = current - 1;
+        while (pos >= 0 && /\s/.test(input[pos])) pos--;
+        return pos >= 0 && input[pos] !== '\n' && input[pos] !== ';';
+    }
+
+    function hasLeadingAndTrailingSpaces() {
+        const hasLeading = current > 0 && /\s/.test(input[current - 1]);
+        const hasTrailing = current + 1 < input.length && /\s/.test(input[current + 1]);
+        return hasLeading && hasTrailing;
+    }
+
     while (current < input.length) {
         let char = input[current];

@@ -170,6 +264,12 @@
                 case 'assert':
                     tokens.push({ type: TokenType.IO_ASSERT, line, column: column - operation.length - 2 });
                     break;
+                case 'listen':
+                    tokens.push({ type: TokenType.IO_LISTEN, line, column: column - operation.length - 2 });
+                    break;
+                case 'emit':
+                    tokens.push({ type: TokenType.IO_EMIT, line, column: column - operation.length - 2 });
+                    break;
                 default:
                     throw new Error(`Unknown IO operation: ..${operation} at line ${line}, column ${column - operation.length - 2}`);
             }
@@ -264,7 +364,7 @@
             case 'function':
                 tokens.push({ type: TokenType.FUNCTION, line, column: startColumn });
                 break;
-            case 'via':
+            case 'via': // Function composition operator: f via g = compose(f, g)
                 tokens.push({ type: TokenType.COMPOSE, line, column: startColumn });
                 break;
             case '_':
@@ -320,7 +420,24 @@
                 current++;
                 column++;
             } else {
-                tokens.push({ type: TokenType.MINUS, line, column });
+                // Check spacing to determine token type
+                const isUnary = !hasLeadingWhitespace();
+                const isBinary = hasLeadingAndTrailingSpaces();
+                const isFollowedByNumber = current + 1 < input.length && /[0-9]/.test(input[current + 1]);
+
+                if (isUnary && isFollowedByNumber) {
+                    // Unary minus at start of expression: -5
+                    tokens.push({ type: TokenType.UNARY_MINUS, line, column });
+                } else if (isBinary) {
+                    // Binary minus with spaces: 5 - 3
+                    tokens.push({ type: TokenType.BINARY_MINUS, line, column });
+                } else if (isFollowedByNumber) {
+                    // Minus followed by number but not at start: 5-3 (legacy)
+                    tokens.push({ type: TokenType.MINUS, line, column });
+                } else {
+                    // Fallback to legacy MINUS token for edge cases
+                    tokens.push({ type: TokenType.MINUS, line, column });
+                }
             }
             break;
         case '*':
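
Example (illustrative only, not part of the commit): a minimal sketch of how the spacing-based minus handling and the new IO tokens introduced above are expected to tokenize. It assumes the lexer and TokenType exports from this module and the token shape documented in the JSDoc; index positions and value fields are inferred from the diff, not verified against the rest of the file.

// Illustrative sketch; assumes the exports and token shape documented above.
import { lexer, TokenType } from './lexer.js';

const spaced = lexer('5 - 3');
// Expected: spaced[1].type === TokenType.BINARY_MINUS (whitespace on both sides)

const negated = lexer('-5');
// Expected: negated[0].type === TokenType.UNARY_MINUS (nothing before the minus, digit after)

const unspaced = lexer('5-3');
// Expected: unspaced[1].type === TokenType.MINUS (legacy fallback for unspaced infix minus)

const listen = lexer('..listen');
// Expected: listen[0].type === TokenType.IO_LISTEN (new IO operation token)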