// Lexer for the scripting language
// Supports both Node.js and browser environments

/**
 * Token types for the language
 *
 * @description Defines all token types used by the lexer and parser.
 * Each token type represents a distinct syntactic element in the language.
 *
 * The token types are organized into categories:
 * - Literals: NUMBER, STRING, TRUE, FALSE
 * - Operators: PLUS, MINUS, MULTIPLY, DIVIDE, MODULO, POWER, etc.
 * - Keywords: WHEN, IS, THEN, FUNCTION, etc.
 * - Punctuation: LEFT_PAREN, RIGHT_PAREN, SEMICOLON, COMMA, etc.
 * - Special: IO_IN, IO_OUT, IO_ASSERT, FUNCTION_REF, FUNCTION_ARG
 *
 * This enumeration provides a centralized definition of all possible
 * token types, ensuring consistency between lexer and parser. Every value
 * is a string equal to its own key.
 */
export const TokenType = {
    NUMBER: 'NUMBER',
    PLUS: 'PLUS',
    MINUS: 'MINUS',
    MULTIPLY: 'MULTIPLY',
    DIVIDE: 'DIVIDE',
    IDENTIFIER: 'IDENTIFIER',
    ASSIGNMENT: 'ASSIGNMENT',
    ARROW: 'ARROW',
    CASE: 'CASE',
    OF: 'OF',
    WHEN: 'WHEN',
    IS: 'IS',
    THEN: 'THEN',
    WILDCARD: 'WILDCARD',
    FUNCTION: 'FUNCTION',
    LEFT_PAREN: 'LEFT_PAREN',
    RIGHT_PAREN: 'RIGHT_PAREN',
    LEFT_BRACE: 'LEFT_BRACE',
    RIGHT_BRACE: 'RIGHT_BRACE',
    LEFT_BRACKET: 'LEFT_BRACKET',
    RIGHT_BRACKET: 'RIGHT_BRACKET',
    SEMICOLON: 'SEMICOLON',
    COMMA: 'COMMA',
    DOT: 'DOT',
    STRING: 'STRING',
    TRUE: 'TRUE',
    FALSE: 'FALSE',
    AND: 'AND',
    OR: 'OR',
    XOR: 'XOR',
    NOT: 'NOT',
    EQUALS: 'EQUALS',
    LESS_THAN: 'LESS_THAN',
    GREATER_THAN: 'GREATER_THAN',
    LESS_EQUAL: 'LESS_EQUAL',
    GREATER_EQUAL: 'GREATER_EQUAL',
    NOT_EQUAL: 'NOT_EQUAL',
    MODULO: 'MODULO',
    POWER: 'POWER',
    IO_IN: 'IO_IN',
    IO_OUT: 'IO_OUT',
    IO_ASSERT: 'IO_ASSERT',
    FUNCTION_REF: 'FUNCTION_REF',
    FUNCTION_ARG: 'FUNCTION_ARG',
    COMPOSE: 'COMPOSE'
};

// Character classes used by the scanner. None of these carry the `g`/`y`
// flag, so `.test()` is stateless and they are safe to share across calls.
const WHITESPACE_CHAR = /\s/;
const DIGIT_CHAR = /[0-9]/;
const LETTER_CHAR = /[a-zA-Z]/;
const IDENTIFIER_START_CHAR = /[a-zA-Z_]/;
const IDENTIFIER_PART_CHAR = /[a-zA-Z0-9_]/;

// Reserved words mapped to their token type. A Map (rather than a plain
// object) keeps inherited names such as `constructor` from being mistaken
// for keywords.
const KEYWORD_TOKEN_TYPES = new Map([
    ['true', TokenType.TRUE],
    ['false', TokenType.FALSE],
    ['and', TokenType.AND],
    ['or', TokenType.OR],
    ['xor', TokenType.XOR],
    ['not', TokenType.NOT],
    ['case', TokenType.CASE],
    ['of', TokenType.OF],
    ['when', TokenType.WHEN],
    ['is', TokenType.IS],
    ['then', TokenType.THEN],
    ['function', TokenType.FUNCTION],
    ['via', TokenType.COMPOSE],
    ['_', TokenType.WILDCARD]
]);

// IO operation names (the part after `..`) mapped to their token type.
const IO_TOKEN_TYPES = new Map([
    ['in', TokenType.IO_IN],
    ['out', TokenType.IO_OUT],
    ['assert', TokenType.IO_ASSERT]
]);

// Punctuation and operators that are always exactly one character long.
const SINGLE_CHAR_TOKEN_TYPES = new Map([
    ['+', TokenType.PLUS],
    ['*', TokenType.MULTIPLY],
    ['/', TokenType.DIVIDE],
    ['%', TokenType.MODULO],
    ['^', TokenType.POWER],
    ['(', TokenType.LEFT_PAREN],
    [')', TokenType.RIGHT_PAREN],
    ['{', TokenType.LEFT_BRACE],
    ['}', TokenType.RIGHT_BRACE],
    ['[', TokenType.LEFT_BRACKET],
    [']', TokenType.RIGHT_BRACKET],
    [';', TokenType.SEMICOLON],
    [',', TokenType.COMMA],
    ['.', TokenType.DOT],
    [':', TokenType.ASSIGNMENT]
]);

// Escape letters that decode to a different character. Any other escaped
// character (including `\\` and `\"`) decodes to itself.
const STRING_ESCAPES = new Map([
    ['n', '\n'],
    ['t', '\t'],
    ['r', '\r']
]);

/**
 * Converts source code into tokens
 *
 * @param {string} input - The source code to tokenize
 * @returns {Array.<Object>} Array of token objects. Every token has `type`,
 *   `line` and `column` (both 1-based, pointing at the token's first
 *   character). NUMBER, STRING, TRUE, FALSE and IDENTIFIER tokens also carry
 *   `value`; FUNCTION_REF tokens carry `name`.
 * @throws {Error} For unexpected characters, unknown `..` IO operations,
 *   an `@` that is not followed by a name or `(`, an unterminated string
 *   literal, or an unterminated multi-line comment
 *
 * @description The lexer scans the input one character at a time, using a
 * single character of lookahead for two-character tokens (`->`, `==`, `<=`,
 * `>=`, `!=`, `..`, comment openers). It tracks line and column so that
 * tokens and error messages carry source positions.
 *
 * Recognized constructs:
 * - Whitespace and comments (single-line and multi-line), which are skipped
 * - IO operations `..in`, `..out`, `..assert`
 * - Function references `@name` and the function-argument marker `@(`
 *   (the `(` itself is emitted as a separate LEFT_PAREN token)
 * - Numbers: digits with at most one decimal point that must be followed
 *   by a digit, so `1.2.3` lexes as NUMBER, DOT, NUMBER and `1..out` as
 *   NUMBER, IO_OUT
 * - Identifiers and keywords, including `via` (COMPOSE) and `_` (WILDCARD)
 * - Double-quoted strings with `\n`, `\t`, `\r`, `\\` and `\"` escapes;
 *   any other escaped character stands for itself
 * - Operators and punctuation; both `=` and `==` produce EQUALS
 */
export function lexer(input) {
    const tokens = [];
    let current = 0;
    let line = 1;
    let column = 1;

    while (current < input.length) {
        const char = input[current];
        const next = input[current + 1];

        // Skip whitespace
        if (WHITESPACE_CHAR.test(char)) {
            if (char === '\n') {
                line++;
                column = 1;
            } else {
                column++;
            }
            current++;
            continue;
        }

        // Skip single-line comments; the newline itself is left for the
        // whitespace branch so line counting stays in one place.
        if (char === '/' && next === '/') {
            while (current < input.length && input[current] !== '\n') {
                current++;
                column++;
            }
            continue;
        }

        // Skip multi-line comments
        if (char === '/' && next === '*') {
            const startLine = line;
            const startColumn = column;
            let closed = false;
            current += 2; // Skip the opener
            column += 2;
            while (current < input.length) {
                if (input[current] === '*' && input[current + 1] === '/') {
                    current += 2; // Skip the closer
                    column += 2;
                    closed = true;
                    break;
                }
                if (input[current] === '\n') {
                    line++;
                    column = 1;
                } else {
                    column++;
                }
                current++;
            }
            if (!closed) {
                // Without this, the tail of the comment would be lexed as code.
                throw new Error(`Unterminated comment at line ${startLine}, column ${startColumn}`);
            }
            continue;
        }

        // IO operations (..in, ..out, ..assert)
        if (char === '.' && next === '.') {
            const startColumn = column;
            current += 2; // Skip both dots
            column += 2;

            let operation = '';
            while (current < input.length && LETTER_CHAR.test(input[current])) {
                operation += input[current];
                current++;
                column++;
            }

            const ioType = IO_TOKEN_TYPES.get(operation);
            if (ioType === undefined) {
                throw new Error(`Unknown IO operation: ..${operation} at line ${line}, column ${startColumn}`);
            }
            tokens.push({ type: ioType, line, column: startColumn });
            continue;
        }

        // Function references (@function) and function arguments (@(expression))
        if (char === '@') {
            const startColumn = column;
            current++; // Skip '@'
            column++;

            // @(expression): emit the marker only; '(' is lexed on the next pass.
            if (current < input.length && input[current] === '(') {
                tokens.push({ type: TokenType.FUNCTION_ARG, line, column: startColumn });
                continue;
            }

            let functionName = '';
            while (current < input.length && IDENTIFIER_PART_CHAR.test(input[current])) {
                functionName += input[current];
                current++;
                column++;
            }

            if (functionName === '') {
                throw new Error(`Invalid function reference at line ${line}, column ${startColumn}`);
            }

            tokens.push({ type: TokenType.FUNCTION_REF, name: functionName, line, column: startColumn });
            continue;
        }

        // Numbers: integer part, then an optional fraction. The '.' is only
        // consumed when a digit follows, so it can never swallow a DOT token
        // or the '..' of an IO operation.
        if (DIGIT_CHAR.test(char)) {
            const startColumn = column;
            const start = current;
            while (current < input.length && DIGIT_CHAR.test(input[current])) {
                current++;
            }
            if (
                input[current] === '.' &&
                current + 1 < input.length &&
                DIGIT_CHAR.test(input[current + 1])
            ) {
                current++; // Skip '.'
                while (current < input.length && DIGIT_CHAR.test(input[current])) {
                    current++;
                }
            }
            const text = input.slice(start, current);
            column += text.length;
            tokens.push({ type: TokenType.NUMBER, value: parseFloat(text), line, column: startColumn });
            continue;
        }

        // Identifiers and keywords
        if (IDENTIFIER_START_CHAR.test(char)) {
            const startColumn = column;
            let word = '';
            while (current < input.length && IDENTIFIER_PART_CHAR.test(input[current])) {
                word += input[current];
                current++;
                column++;
            }

            const keywordType = KEYWORD_TOKEN_TYPES.get(word);
            if (keywordType === undefined) {
                tokens.push({ type: TokenType.IDENTIFIER, value: word, line, column: startColumn });
            } else if (keywordType === TokenType.TRUE || keywordType === TokenType.FALSE) {
                // Boolean literals carry their JavaScript value.
                tokens.push({ type: keywordType, value: word === 'true', line, column: startColumn });
            } else {
                tokens.push({ type: keywordType, line, column: startColumn });
            }
            continue;
        }

        // Strings
        if (char === '"') {
            const startLine = line;
            const startColumn = column;
            let value = '';
            let closed = false;
            current++; // Skip opening quote
            column++;

            while (current < input.length) {
                const stringChar = input[current];
                if (stringChar === '"') {
                    current++; // Skip closing quote
                    column++;
                    closed = true;
                    break;
                }
                if (stringChar === '\\') {
                    current++; // Skip the backslash
                    column++;
                    if (current >= input.length) {
                        break; // Lone trailing backslash: reported as unterminated
                    }
                    const escaped = input[current];
                    value += STRING_ESCAPES.has(escaped) ? STRING_ESCAPES.get(escaped) : escaped;
                } else {
                    value += stringChar;
                }
                // A raw newline inside the literal still moves to the next line.
                if (input[current] === '\n') {
                    line++;
                    column = 1;
                } else {
                    column++;
                }
                current++;
            }

            if (!closed) {
                throw new Error(`Unterminated string literal at line ${startLine}, column ${startColumn}`);
            }

            tokens.push({ type: TokenType.STRING, value, line: startLine, column: startColumn });
            continue;
        }

        // Operators and punctuation
        let type = SINGLE_CHAR_TOKEN_TYPES.get(char);
        let length = 1;
        if (type === undefined) {
            const followedByEquals = next === '=';
            switch (char) {
                case '-':
                    if (next === '>') {
                        type = TokenType.ARROW;
                        length = 2;
                    } else {
                        type = TokenType.MINUS;
                    }
                    break;
                case '=':
                    // Both '=' and '==' mean equality; ':' is assignment.
                    type = TokenType.EQUALS;
                    length = followedByEquals ? 2 : 1;
                    break;
                case '<':
                    type = followedByEquals ? TokenType.LESS_EQUAL : TokenType.LESS_THAN;
                    length = followedByEquals ? 2 : 1;
                    break;
                case '>':
                    type = followedByEquals ? TokenType.GREATER_EQUAL : TokenType.GREATER_THAN;
                    length = followedByEquals ? 2 : 1;
                    break;
                case '!':
                    if (!followedByEquals) {
                        throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
                    }
                    type = TokenType.NOT_EQUAL;
                    length = 2;
                    break;
                default:
                    throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
            }
        }

        tokens.push({ type, line, column });
        current += length;
        column += length;
    }

    return tokens;
}