// lexer.js - Documentation

// Lexer for the scripting language
// Supports both Node.js and browser environments

/**
 * Token types for the language
 * 
 * @description Defines all token types used by the lexer and parser.
 * Each member's value is a string equal to its own name, so a token's
 * `type` can be compared and printed directly.
 * 
 * The token types are organized into categories:
 * - Literals: NUMBER, STRING, TRUE, FALSE
 * - Operators: PLUS, MINUS, UNARY_MINUS, BINARY_MINUS, MULTIPLY, DIVIDE,
 *   MODULO, POWER, comparison and logical operators, ARROW, COMPOSE
 * - Keywords: CASE, OF, WHEN, IS, THEN, FUNCTION
 * - Punctuation: LEFT_PAREN, RIGHT_PAREN, SEMICOLON, COMMA, DOT, etc.
 * - Special: IO_IN, IO_OUT, IO_ASSERT, IO_LISTEN, IO_EMIT, FUNCTION_REF, FUNCTION_ARG
 * 
 * The object is frozen: it is a shared, centralized definition, and an
 * accidental write to one of its members would otherwise silently change
 * the meaning of every comparison made against it.
 * 
 * @typedef {Object} TokenType
 * @property {string} NUMBER - Numeric literals (integers and floats)
 * @property {string} PLUS - Addition operator (+)
 * @property {string} MINUS - Minus sign (-) that the lexer classified as neither UNARY_MINUS nor BINARY_MINUS
 * @property {string} UNARY_MINUS - Minus sign (-) classified as unary by the lexer (e.g. a leading -5)
 * @property {string} BINARY_MINUS - Minus sign (-) with whitespace on both sides (e.g. 5 - 3)
 * @property {string} MULTIPLY - Multiplication operator (*)
 * @property {string} DIVIDE - Division operator (/)
 * @property {string} IDENTIFIER - Variable names and function names
 * @property {string} ASSIGNMENT - Assignment operator (:)
 * @property {string} ARROW - Function arrow (->)
 * @property {string} CASE - Case keyword
 * @property {string} OF - Of keyword
 * @property {string} WHEN - When keyword for pattern matching
 * @property {string} IS - Is keyword for pattern matching
 * @property {string} THEN - Then keyword for pattern matching
 * @property {string} WILDCARD - Wildcard pattern (_)
 * @property {string} FUNCTION - Function keyword
 * @property {string} LEFT_PAREN - Left parenthesis (()
 * @property {string} RIGHT_PAREN - Right parenthesis ())
 * @property {string} LEFT_BRACE - Left brace ({)
 * @property {string} RIGHT_BRACE - Right brace (})
 * @property {string} LEFT_BRACKET - Left bracket ([)
 * @property {string} RIGHT_BRACKET - Right bracket (])
 * @property {string} SEMICOLON - Semicolon (;)
 * @property {string} COMMA - Comma (,)
 * @property {string} DOT - Dot (.)
 * @property {string} STRING - String literals
 * @property {string} TRUE - Boolean true literal
 * @property {string} FALSE - Boolean false literal
 * @property {string} AND - Logical AND operator
 * @property {string} OR - Logical OR operator
 * @property {string} XOR - Logical XOR operator
 * @property {string} NOT - Logical NOT operator
 * @property {string} EQUALS - Equality operator (== or a single =)
 * @property {string} LESS_THAN - Less than operator (<)
 * @property {string} GREATER_THAN - Greater than operator (>)
 * @property {string} LESS_EQUAL - Less than or equal operator (<=)
 * @property {string} GREATER_EQUAL - Greater than or equal operator (>=)
 * @property {string} NOT_EQUAL - Not equal operator (!=)
 * @property {string} MODULO - Modulo operator (%)
 * @property {string} POWER - Power operator (^)
 * @property {string} IO_IN - Input operation (..in)
 * @property {string} IO_OUT - Output operation (..out)
 * @property {string} IO_ASSERT - Assertion operation (..assert)
 * @property {string} IO_LISTEN - Listen operation (..listen)
 * @property {string} IO_EMIT - Emit operation (..emit)
 * @property {string} FUNCTION_REF - Function reference (@function)
 * @property {string} FUNCTION_ARG - Function argument (@(expression))
 * @property {string} COMPOSE - Function composition (via)
 */
export const TokenType = Object.freeze({
    NUMBER: 'NUMBER',
    PLUS: 'PLUS',
    MINUS: 'MINUS',
    UNARY_MINUS: 'UNARY_MINUS',
    BINARY_MINUS: 'BINARY_MINUS',
    MULTIPLY: 'MULTIPLY',
    DIVIDE: 'DIVIDE',
    IDENTIFIER: 'IDENTIFIER',
    ASSIGNMENT: 'ASSIGNMENT',
    ARROW: 'ARROW',
    CASE: 'CASE',
    OF: 'OF',
    WHEN: 'WHEN',
    IS: 'IS',
    THEN: 'THEN',
    WILDCARD: 'WILDCARD',
    FUNCTION: 'FUNCTION',
    LEFT_PAREN: 'LEFT_PAREN',
    RIGHT_PAREN: 'RIGHT_PAREN',
    LEFT_BRACE: 'LEFT_BRACE',
    RIGHT_BRACE: 'RIGHT_BRACE',
    LEFT_BRACKET: 'LEFT_BRACKET',
    RIGHT_BRACKET: 'RIGHT_BRACKET',
    SEMICOLON: 'SEMICOLON',
    COMMA: 'COMMA',
    DOT: 'DOT',
    STRING: 'STRING',
    TRUE: 'TRUE',
    FALSE: 'FALSE',
    AND: 'AND',
    OR: 'OR',
    XOR: 'XOR',
    NOT: 'NOT',
    EQUALS: 'EQUALS',
    LESS_THAN: 'LESS_THAN',
    GREATER_THAN: 'GREATER_THAN',
    LESS_EQUAL: 'LESS_EQUAL',
    GREATER_EQUAL: 'GREATER_EQUAL',
    NOT_EQUAL: 'NOT_EQUAL',
    MODULO: 'MODULO',
    POWER: 'POWER',
    IO_IN: 'IO_IN',
    IO_OUT: 'IO_OUT',
    IO_ASSERT: 'IO_ASSERT',
    IO_LISTEN: 'IO_LISTEN',
    IO_EMIT: 'IO_EMIT',
    FUNCTION_REF: 'FUNCTION_REF',
    FUNCTION_ARG: 'FUNCTION_ARG',
    COMPOSE: 'COMPOSE'
});

/**
 * Token object structure
 * 
 * @typedef {Object} Token
 * @property {string} type - The token type from TokenType enum
 * @property {*} [value] - The token's value (for literals and identifiers)
 * @property {string} [name] - Function name (for FUNCTION_REF tokens)
 * @property {number} line - Line number where token appears (1-indexed)
 * @property {number} column - Column number where token appears (1-indexed)
 */

/**
 * Converts source code into tokens for the combinator-based language
 * 
 * @param {string} input - The source code to tokenize
 * @returns {Array.<Token>} Array of token objects with type, line, column and,
 *   where applicable, value (literals, identifiers) or name (function references)
 * @throws {Error} For an unexpected character, an unknown `..` IO operation,
 *   an `@` that is followed by neither `(` nor a name, an unterminated string
 *   literal, or an unterminated block comment. Messages include line and column.
 * 
 * @description The lexer scans the input one character at a time with a single
 * character of lookahead, tracking a 1-indexed line and column. At each
 * position it tries, in this order:
 * 
 * 1. Whitespace (skipped; a newline advances the line counter)
 * 2. Line comments starting with `//` (skipped up to the newline)
 * 3. Block comments starting with `/*` (skipped through the closing star-slash)
 * 4. IO operations: `..in`, `..out`, `..assert`, `..listen`, `..emit`
 * 5. `@name` (FUNCTION_REF) and `@(` (FUNCTION_ARG; the parenthesis itself is
 *    left in place and lexed as LEFT_PAREN on the next iteration)
 * 6. Numbers: digits with at most one fractional part. A dot only belongs to
 *    the number when a digit follows it, so `5..out` and `t.1.name` keep
 *    their dots as separate tokens.
 * 7. Identifiers and keywords (`true`, `false`, `and`, `or`, `xor`, `not`,
 *    `case`, `of`, `when`, `is`, `then`, `function`, `via`, `_`)
 * 8. Double-quoted strings with the escapes `\n`, `\t`, `\r`, `\\`, `\"`;
 *    any other escaped character stands for itself. A string token reports
 *    the position of its opening quote.
 * 9. `->` and the three flavours of minus (see below)
 * 10. Comparison operators (`=`, `==`, `<`, `<=`, `>`, `>=`, `!=`)
 * 11. Single-character operators and punctuation
 * 
 * Minus classification depends on the surrounding characters:
 * - UNARY_MINUS: a digit follows, and the nearest preceding non-whitespace
 *   character is either absent (start of input) or `;`
 * - BINARY_MINUS: whitespace immediately before and immediately after
 * - MINUS: everything else (e.g. `5-3`, `(-5)`, `x : -5`)
 */
export function lexer(input) {
    const tokens = [];
    let current = 0;
    let line = 1;
    let column = 1;

    // Lookup tables are built inside the function so the block depends on
    // nothing but TokenType. Maps (not plain objects) are used because the
    // keys come from user input: an identifier such as `constructor` must
    // not resolve to an inherited Object.prototype member.
    const keywords = new Map([
        ['true', { type: TokenType.TRUE, value: true }],
        ['false', { type: TokenType.FALSE, value: false }],
        ['and', { type: TokenType.AND }],
        ['or', { type: TokenType.OR }],
        ['xor', { type: TokenType.XOR }],
        ['not', { type: TokenType.NOT }],
        ['case', { type: TokenType.CASE }],
        ['of', { type: TokenType.OF }],
        ['when', { type: TokenType.WHEN }],
        ['is', { type: TokenType.IS }],
        ['then', { type: TokenType.THEN }],
        ['function', { type: TokenType.FUNCTION }],
        ['via', { type: TokenType.COMPOSE }], // f via g = compose(f, g)
        ['_', { type: TokenType.WILDCARD }]
    ]);

    const ioOperations = new Map([
        ['in', TokenType.IO_IN],
        ['out', TokenType.IO_OUT],
        ['assert', TokenType.IO_ASSERT],
        ['listen', TokenType.IO_LISTEN],
        ['emit', TokenType.IO_EMIT]
    ]);

    const punctuation = new Map([
        ['+', TokenType.PLUS],
        ['*', TokenType.MULTIPLY],
        ['/', TokenType.DIVIDE],
        ['%', TokenType.MODULO],
        ['^', TokenType.POWER],
        ['(', TokenType.LEFT_PAREN],
        [')', TokenType.RIGHT_PAREN],
        ['{', TokenType.LEFT_BRACE],
        ['}', TokenType.RIGHT_BRACE],
        ['[', TokenType.LEFT_BRACKET],
        [']', TokenType.RIGHT_BRACKET],
        [';', TokenType.SEMICOLON],
        [',', TokenType.COMMA],
        ['.', TokenType.DOT],
        [':', TokenType.ASSIGNMENT]
    ]);

    // `alone` is the token for the bare character, `withEquals` the token
    // when '=' follows. A bare '!' is not a valid token. A single '=' is
    // accepted as equality (it is used that way in assertions).
    const comparisons = new Map([
        ['=', { alone: TokenType.EQUALS, withEquals: TokenType.EQUALS }],
        ['<', { alone: TokenType.LESS_THAN, withEquals: TokenType.LESS_EQUAL }],
        ['>', { alone: TokenType.GREATER_THAN, withEquals: TokenType.GREATER_EQUAL }],
        ['!', { alone: null, withEquals: TokenType.NOT_EQUAL }]
    ]);

    // Escapes that translate to a different character; every other escaped
    // character (including '\\' and '"') stands for itself.
    const stringEscapes = new Map([
        ['n', '\n'],
        ['t', '\t'],
        ['r', '\r']
    ]);

    // The `undefined` guards matter: RegExp.prototype.test stringifies its
    // argument, and the text "undefined" would match the letter classes.
    const isWhitespace = (ch) => ch !== undefined && /\s/.test(ch);
    const isDigit = (ch) => ch !== undefined && /[0-9]/.test(ch);
    const isLetter = (ch) => ch !== undefined && /[a-zA-Z]/.test(ch);
    const isIdentifierStart = (ch) => ch !== undefined && /[a-zA-Z_]/.test(ch);
    const isIdentifierPart = (ch) => ch !== undefined && /[a-zA-Z0-9_]/.test(ch);

    // Moves forward over characters known not to be newlines.
    function advance(count = 1) {
        current += count;
        column += count;
    }

    // Moves forward one character, starting a new line if it is a newline.
    function advanceTrackingLines() {
        if (input[current] === '\n') {
            line++;
            column = 1;
        } else {
            column++;
        }
        current++;
    }

    // True when some non-whitespace character other than ';' precedes the
    // current position, i.e. the minus sign continues an expression.
    function followsExpressionCharacter() {
        let pos = current - 1;
        while (pos >= 0 && isWhitespace(input[pos])) {
            pos--;
        }
        return pos >= 0 && input[pos] !== ';';
    }

    // Classifies the '-' at the current position (known not to start '->').
    function classifyMinus() {
        const followedByDigit = isDigit(input[current + 1]);
        if (followedByDigit && !followsExpressionCharacter()) {
            return TokenType.UNARY_MINUS;
        }
        if (isWhitespace(input[current - 1]) && isWhitespace(input[current + 1])) {
            return TokenType.BINARY_MINUS;
        }
        return TokenType.MINUS;
    }

    while (current < input.length) {
        const char = input[current];
        const next = input[current + 1];

        // Whitespace
        if (isWhitespace(char)) {
            advanceTrackingLines();
            continue;
        }

        // Single-line comment: stop at the newline so the whitespace branch
        // performs the line bookkeeping.
        if (char === '/' && next === '/') {
            while (current < input.length && input[current] !== '\n') {
                advance();
            }
            continue;
        }

        // Multi-line comment /* ... */
        if (char === '/' && next === '*') {
            const startLine = line;
            const startColumn = column;
            advance(2);
            while (current < input.length && !(input[current] === '*' && input[current + 1] === '/')) {
                advanceTrackingLines();
            }
            if (current >= input.length) {
                throw new Error(`Unterminated comment starting at line ${startLine}, column ${startColumn}`);
            }
            advance(2);
            continue;
        }

        // IO operations (..in, ..out, ..assert, ..listen, ..emit)
        if (char === '.' && next === '.') {
            const startColumn = column;
            advance(2);
            const nameStart = current;
            while (current < input.length && isLetter(input[current])) {
                advance();
            }
            const operation = input.slice(nameStart, current);
            const type = ioOperations.get(operation);
            if (type === undefined) {
                throw new Error(`Unknown IO operation: ..${operation} at line ${line}, column ${startColumn}`);
            }
            tokens.push({ type, line, column: startColumn });
            continue;
        }

        // Function references (@function) and function arguments (@(expression))
        if (char === '@') {
            const startColumn = column;
            advance();

            if (input[current] === '(') {
                // Only the '@' is consumed; '(' is lexed as LEFT_PAREN next.
                tokens.push({ type: TokenType.FUNCTION_ARG, line, column: startColumn });
                continue;
            }

            const nameStart = current;
            while (current < input.length && isIdentifierPart(input[current])) {
                advance();
            }
            const functionName = input.slice(nameStart, current);
            if (functionName === '') {
                throw new Error(`Invalid function reference at line ${line}, column ${startColumn}`);
            }
            tokens.push({ type: TokenType.FUNCTION_REF, name: functionName, line, column: startColumn });
            continue;
        }

        // Numbers: integer part plus at most one ".digits" fraction
        if (isDigit(char)) {
            const startColumn = column;
            const start = current;
            while (current < input.length && isDigit(input[current])) {
                advance();
            }
            if (input[current] === '.' && isDigit(input[current + 1])) {
                advance(); // the decimal point
                while (current < input.length && isDigit(input[current])) {
                    advance();
                }
            }
            const text = input.slice(start, current);
            tokens.push({ type: TokenType.NUMBER, value: Number.parseFloat(text), line, column: startColumn });
            continue;
        }

        // Identifiers and keywords
        if (isIdentifierStart(char)) {
            const startColumn = column;
            const start = current;
            while (current < input.length && isIdentifierPart(input[current])) {
                advance();
            }
            const value = input.slice(start, current);
            const keyword = keywords.get(value);
            if (keyword !== undefined) {
                tokens.push(Object.assign({}, keyword, { line, column: startColumn }));
            } else {
                tokens.push({ type: TokenType.IDENTIFIER, value, line, column: startColumn });
            }
            continue;
        }

        // Strings
        if (char === '"') {
            const startLine = line;
            const startColumn = column;
            let value = '';
            advance(); // opening quote
            while (current < input.length && input[current] !== '"') {
                if (input[current] === '\\') {
                    advance(); // the backslash
                    if (current >= input.length) {
                        break; // dangling backslash at end of input
                    }
                    const escaped = input[current];
                    value += stringEscapes.has(escaped) ? stringEscapes.get(escaped) : escaped;
                } else {
                    value += input[current];
                }
                // Strings may span lines; keep positions of later tokens right.
                advanceTrackingLines();
            }
            if (current >= input.length) {
                throw new Error(`Unterminated string starting at line ${startLine}, column ${startColumn}`);
            }
            advance(); // closing quote
            tokens.push({ type: TokenType.STRING, value, line: startLine, column: startColumn });
            continue;
        }

        // Arrow and minus variants
        if (char === '-') {
            if (next === '>') {
                tokens.push({ type: TokenType.ARROW, line, column });
                advance(2);
            } else {
                tokens.push({ type: classifyMinus(), line, column });
                advance();
            }
            continue;
        }

        // Comparison operators, optionally followed by '='
        const comparison = comparisons.get(char);
        if (comparison !== undefined) {
            if (next === '=') {
                tokens.push({ type: comparison.withEquals, line, column });
                advance(2);
            } else if (comparison.alone !== null) {
                tokens.push({ type: comparison.alone, line, column });
                advance();
            } else {
                throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
            }
            continue;
        }

        // Single-character operators and punctuation
        const punctuationType = punctuation.get(char);
        if (punctuationType === undefined) {
            throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
        }
        tokens.push({ type: punctuationType, line, column });
        advance();
    }

    return tokens;
}