// Lexer for the scripting language
// Supports both Node.js and browser environments
/**
* Token types for the language
*
* @description Defines all token types used by the lexer and parser.
* Each token type represents a distinct syntactic element in the language.
*
* The token types are organized into categories:
* - Literals: NUMBER, STRING, TRUE, FALSE
* - Operators: PLUS, MINUS, MULTIPLY, DIVIDE, MODULO, POWER, etc.
* - Keywords: WHEN, IS, THEN, FUNCTION, etc.
* - Punctuation: LEFT_PAREN, RIGHT_PAREN, SEMICOLON, COMMA, etc.
* - Special: IO_IN, IO_OUT, IO_ASSERT, IO_LISTEN, IO_EMIT, FUNCTION_REF, FUNCTION_ARG
*
* This enumeration provides a centralized definition of all possible
* token types, ensuring consistency between lexer and parser. The token
* types are designed to support the combinator-based architecture where
* all operations are translated to function calls.
*
* @typedef {Object} TokenType
* @property {string} NUMBER - Numeric literals (integers and floats)
* @property {string} PLUS - Addition operator (+)
* @property {string} MINUS - Subtraction operator (-)
* @property {string} UNARY_MINUS - Unary minus in prefix position (e.g. -5)
* @property {string} BINARY_MINUS - Binary minus with spaces on both sides (e.g. 5 - 3)
* @property {string} MULTIPLY - Multiplication operator (*)
* @property {string} DIVIDE - Division operator (/)
* @property {string} IDENTIFIER - Variable names and function names
* @property {string} ASSIGNMENT - Assignment operator (:)
* @property {string} ARROW - Function arrow (->)
* @property {string} CASE - Case keyword
* @property {string} OF - Of keyword
* @property {string} WHEN - When keyword for pattern matching
* @property {string} IS - Is keyword for pattern matching
* @property {string} THEN - Then keyword for pattern matching
* @property {string} WILDCARD - Wildcard pattern (_)
* @property {string} FUNCTION - Function keyword
* @property {string} LEFT_PAREN - Left parenthesis (()
* @property {string} RIGHT_PAREN - Right parenthesis ())
* @property {string} LEFT_BRACE - Left brace ({)
* @property {string} RIGHT_BRACE - Right brace (})
* @property {string} LEFT_BRACKET - Left bracket ([)
* @property {string} RIGHT_BRACKET - Right bracket (])
* @property {string} SEMICOLON - Semicolon (;)
* @property {string} COMMA - Comma (,)
* @property {string} DOT - Dot (.)
* @property {string} STRING - String literals
* @property {string} TRUE - Boolean true literal
* @property {string} FALSE - Boolean false literal
* @property {string} AND - Logical AND operator
* @property {string} OR - Logical OR operator
* @property {string} XOR - Logical XOR operator
* @property {string} NOT - Logical NOT operator
* @property {string} EQUALS - Equality operator (==)
* @property {string} LESS_THAN - Less than operator (<)
* @property {string} GREATER_THAN - Greater than operator (>)
* @property {string} LESS_EQUAL - Less than or equal operator (<=)
* @property {string} GREATER_EQUAL - Greater than or equal operator (>=)
* @property {string} NOT_EQUAL - Not equal operator (!=)
* @property {string} MODULO - Modulo operator (%)
* @property {string} POWER - Power operator (^)
* @property {string} IO_IN - Input operation (..in)
* @property {string} IO_OUT - Output operation (..out)
* @property {string} IO_ASSERT - Assertion operation (..assert)
* @property {string} IO_LISTEN - Listen operation (..listen)
* @property {string} IO_EMIT - Emit operation (..emit)
* @property {string} FUNCTION_REF - Function reference (@function)
* @property {string} FUNCTION_ARG - Function argument (@(expression))
* @property {string} COMPOSE - Function composition (via)
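*
* @example
* // Illustrative only: a consumer such as the parser would typically dispatch
* // on token.type. `token` here is assumed to be one element of lexer()'s output.
* if (token.type === TokenType.NUMBER) {
*   console.log('number literal:', token.value);
* }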
*/
export const TokenType = {
NUMBER: 'NUMBER',
PLUS: 'PLUS',
MINUS: 'MINUS',
UNARY_MINUS: 'UNARY_MINUS',
BINARY_MINUS: 'BINARY_MINUS',
MULTIPLY: 'MULTIPLY',
DIVIDE: 'DIVIDE',
IDENTIFIER: 'IDENTIFIER',
ASSIGNMENT: 'ASSIGNMENT',
ARROW: 'ARROW',
CASE: 'CASE',
OF: 'OF',
WHEN: 'WHEN',
IS: 'IS',
THEN: 'THEN',
WILDCARD: 'WILDCARD',
FUNCTION: 'FUNCTION',
LEFT_PAREN: 'LEFT_PAREN',
RIGHT_PAREN: 'RIGHT_PAREN',
LEFT_BRACE: 'LEFT_BRACE',
RIGHT_BRACE: 'RIGHT_BRACE',
LEFT_BRACKET: 'LEFT_BRACKET',
RIGHT_BRACKET: 'RIGHT_BRACKET',
SEMICOLON: 'SEMICOLON',
COMMA: 'COMMA',
DOT: 'DOT',
STRING: 'STRING',
TRUE: 'TRUE',
FALSE: 'FALSE',
AND: 'AND',
OR: 'OR',
XOR: 'XOR',
NOT: 'NOT',
EQUALS: 'EQUALS',
LESS_THAN: 'LESS_THAN',
GREATER_THAN: 'GREATER_THAN',
LESS_EQUAL: 'LESS_EQUAL',
GREATER_EQUAL: 'GREATER_EQUAL',
NOT_EQUAL: 'NOT_EQUAL',
MODULO: 'MODULO',
POWER: 'POWER',
IO_IN: 'IO_IN',
IO_OUT: 'IO_OUT',
IO_ASSERT: 'IO_ASSERT',
IO_LISTEN: 'IO_LISTEN',
IO_EMIT: 'IO_EMIT',
FUNCTION_REF: 'FUNCTION_REF',
FUNCTION_ARG: 'FUNCTION_ARG',
COMPOSE: 'COMPOSE'
};
/**
* Token object structure
*
* @typedef {Object} Token
* @property {string} type - The token type from TokenType enum
* @property {*} [value] - The token's value (for literals and identifiers)
* @property {string} [name] - Function name (for FUNCTION_REF tokens)
* @property {number} line - Line number where token appears (1-indexed)
* @property {number} column - Column number where token appears (1-indexed)
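*
* @example
* // Illustrative token shapes (values are hypothetical):
* // { type: TokenType.NUMBER, value: 42, line: 1, column: 5 }
* // FUNCTION_REF tokens carry a name instead of a value:
* // { type: TokenType.FUNCTION_REF, name: 'add', line: 2, column: 10 }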
*/
/**
* Converts source code into tokens for the combinator-based language
*
* @param {string} input - The source code to tokenize
* @returns {Array.<Token>} Array of token objects with type, value, line, and column
* @throws {Error} For unexpected characters or malformed tokens
*
* @description The lexer performs lexical analysis by converting source code
* into a stream of tokens. Each token represents a meaningful unit of the
* language syntax, such as identifiers, literals, operators, and keywords.
*
* The lexer implements a character-by-character scanning approach with
* lookahead for multi-character tokens. It maintains line and column
* information for accurate error reporting and debugging.
*
* Key features:
* - Handles whitespace and comments (single-line and multi-line)
* - Recognizes all language constructs including operators, keywords, and literals
* - Supports string literals with escape sequences
* - Provides detailed position information for error reporting
* - Cross-platform compatibility (Node.js, Bun, browser)
* - Supports function composition with 'via' keyword
* - Handles function references with '@' operator
*
* The lexer is designed to be robust and to report clear errors for malformed
* input, so syntax problems in user code are easy to locate. It supports the
* combinator-based architecture by recognizing every operator and special
* token needed for function composition and application.
*
* As the first step in the language processing pipeline, the lexer must
* correctly identify all tokens that the parser will translate into function
* calls: operators that become combinator calls, function references that
* enable higher-order programming, and the keywords that support the
* functional programming paradigm.
*
* Scanning dispatches on the class of the current character, which keeps the
* handling of each token type separate while still supporting multi-character
* tokens such as two-character operators and string literals with escape
* sequences. Every error message includes the line and column at which the
* problem was found, so users can quickly locate and fix syntax errors.
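*
* @example
* // A minimal sketch of the lexer's output for a simple assignment:
* const tokens = lexer('x : 5 + 3');
* // tokens now contains:
* // [
* //   { type: 'IDENTIFIER', value: 'x', line: 1, column: 1 },
* //   { type: 'ASSIGNMENT', line: 1, column: 3 },
* //   { type: 'NUMBER', value: 5, line: 1, column: 5 },
* //   { type: 'PLUS', line: 1, column: 7 },
* //   { type: 'NUMBER', value: 3, line: 1, column: 9 }
* // ]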
*/
export function lexer(input) {
const tokens = [];
let current = 0;
let line = 1;
let column = 1;
// Helper functions for spacing detection
// Returns true when a token (any non-whitespace character other than ';')
// precedes the current position on the same statement; used to distinguish
// unary from binary minus. Only spaces and tabs are skipped, so a preceding
// newline or ';' marks the start of a new expression.
function hasPrecedingToken() {
let pos = current - 1;
while (pos >= 0 && /[ \t]/.test(input[pos])) pos--;
return pos >= 0 && input[pos] !== '\n' && input[pos] !== ';';
}
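// Returns true when the current character has whitespace immediately on both
// sides, marking it as a binary operator (e.g. the '-' in "5 - 3")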
function hasLeadingAndTrailingSpaces() {
const hasLeading = current > 0 && /\s/.test(input[current - 1]);
const hasTrailing = current + 1 < input.length && /\s/.test(input[current + 1]);
return hasLeading && hasTrailing;
}
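// Main scan loop: each iteration classifies the current character and emits at most one token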
while (current < input.length) {
let char = input[current];
// Skip whitespace
if (/\s/.test(char)) {
if (char === '\n') {
line++;
column = 1;
} else {
column++;
}
current++;
continue;
}
// Skip comments (single line and multi-line)
if (char === '/' && input[current + 1] === '/') {
while (current < input.length && input[current] !== '\n') {
current++;
column++;
}
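// The '\n' is left in place so the whitespace branch updates line/column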
continue;
}
// Skip multi-line comments /* ... */
if (char === '/' && input[current + 1] === '*') {
current += 2; // Skip /*
column += 2;
while (current < input.length - 1 && !(input[current] === '*' && input[current + 1] === '/')) {
if (input[current] === '\n') {
line++;
column = 1;
} else {
column++;
}
current++;
}
if (current < input.length - 1) {
current += 2; // Skip */
column += 2;
}
continue;
}
// IO operations (..in, ..out, ..assert)
if (char === '.' && input[current + 1] === '.') {
current += 2; // Skip both dots
column += 2;
// Read the IO operation name
let operation = '';
while (current < input.length && /[a-zA-Z]/.test(input[current])) {
operation += input[current];
current++;
column++;
}
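// column has already advanced past '..' and the name, so subtract to report the first '.'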
// Determine the IO operation type
switch (operation) {
case 'in':
tokens.push({ type: TokenType.IO_IN, line, column: column - operation.length - 2 });
break;
case 'out':
tokens.push({ type: TokenType.IO_OUT, line, column: column - operation.length - 2 });
break;
case 'assert':
tokens.push({ type: TokenType.IO_ASSERT, line, column: column - operation.length - 2 });
break;
case 'listen':
tokens.push({ type: TokenType.IO_LISTEN, line, column: column - operation.length - 2 });
break;
case 'emit':
tokens.push({ type: TokenType.IO_EMIT, line, column: column - operation.length - 2 });
break;
default:
throw new Error(`Unknown IO operation: ..${operation} at line ${line}, column ${column - operation.length - 2}`);
}
continue;
}
// Function references (@function) and function arguments (@(expression))
if (char === '@') {
current++; // Skip '@'
column++;
// Check if this is @(expression) for function arguments
if (current < input.length && input[current] === '(') {
// This is @(expression) - mark as function argument
tokens.push({ type: TokenType.FUNCTION_ARG, line, column: column - 1 });
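// The '(' is not consumed here; the next iteration emits a LEFT_PAREN token for it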
continue;
}
// Read the function name
let functionName = '';
while (current < input.length && /[a-zA-Z0-9_]/.test(input[current])) {
functionName += input[current];
current++;
column++;
}
if (functionName === '') {
throw new Error(`Invalid function reference at line ${line}, column ${column - 1}`);
}
tokens.push({ type: TokenType.FUNCTION_REF, name: functionName, line, column: column - functionName.length - 1 });
continue;
}
// Numbers
if (/[0-9]/.test(char)) {
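// Greedy scan over digits and '.', so decimal literals like 3.14 are read in one pass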
let value = '';
while (current < input.length && /[0-9.]/.test(input[current])) {
value += input[current];
current++;
column++;
}
tokens.push({ type: TokenType.NUMBER, value: parseFloat(value), line, column: column - value.length });
continue;
}
// Identifiers and keywords
if (/[a-zA-Z_]/.test(char)) {
let value = '';
const startColumn = column;
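// Remember the starting column so the emitted token points at the identifier's first character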
while (current < input.length && /[a-zA-Z0-9_]/.test(input[current])) {
value += input[current];
current++;
column++;
}
// Check for keywords
switch (value) {
case 'true':
tokens.push({ type: TokenType.TRUE, value: true, line, column: startColumn });
break;
case 'false':
tokens.push({ type: TokenType.FALSE, value: false, line, column: startColumn });
break;
case 'and':
tokens.push({ type: TokenType.AND, line, column: startColumn });
break;
case 'or':
tokens.push({ type: TokenType.OR, line, column: startColumn });
break;
case 'xor':
tokens.push({ type: TokenType.XOR, line, column: startColumn });
break;
case 'not':
tokens.push({ type: TokenType.NOT, line, column: startColumn });
break;
case 'case':
tokens.push({ type: TokenType.CASE, line, column: startColumn });
break;
case 'of':
tokens.push({ type: TokenType.OF, line, column: startColumn });
break;
case 'when':
tokens.push({ type: TokenType.WHEN, line, column: startColumn });
break;
case 'is':
tokens.push({ type: TokenType.IS, line, column: startColumn });
break;
case 'then':
tokens.push({ type: TokenType.THEN, line, column: startColumn });
break;
case 'function':
tokens.push({ type: TokenType.FUNCTION, line, column: startColumn });
break;
case 'via': // Function composition operator: f via g = compose(f, g)
tokens.push({ type: TokenType.COMPOSE, line, column: startColumn });
break;
case '_':
tokens.push({ type: TokenType.WILDCARD, line, column: startColumn });
break;
default:
tokens.push({ type: TokenType.IDENTIFIER, value, line, column: startColumn });
}
continue;
}
// Strings
if (char === '"') {
let value = '';
current++;
column++;
while (current < input.length && input[current] !== '"') {
if (input[current] === '\\') {
current++;
column++;
if (current < input.length) {
switch (input[current]) {
case 'n': value += '\n'; break;
case 't': value += '\t'; break;
case 'r': value += '\r'; break;
case '\\': value += '\\'; break;
case '"': value += '"'; break;
default: value += input[current];
}
}
} else {
value += input[current];
}
current++;
column++;
}
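// Consume the closing quote if present; an unterminated string at end of input is emitted as-is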
if (current < input.length) {
current++;
column++;
}
tokens.push({ type: TokenType.STRING, value, line, column: column - value.length - 2 });
continue;
}
// Operators and punctuation
switch (char) {
case '+':
tokens.push({ type: TokenType.PLUS, line, column });
break;
case '-':
if (input[current + 1] === '>') {
tokens.push({ type: TokenType.ARROW, line, column });
current++;
column++;
} else {
// Check spacing to determine token type
const isUnary = !hasPrecedingToken();
const isBinary = hasLeadingAndTrailingSpaces();
const isFollowedByNumber = current + 1 < input.length && /[0-9]/.test(input[current + 1]);
if (isUnary && isFollowedByNumber) {
// Unary minus at start of expression: -5
tokens.push({ type: TokenType.UNARY_MINUS, line, column });
} else if (isBinary) {
// Binary minus with spaces: 5 - 3
tokens.push({ type: TokenType.BINARY_MINUS, line, column });
} else if (isFollowedByNumber) {
// Minus followed by number but not at start: 5-3 (legacy)
tokens.push({ type: TokenType.MINUS, line, column });
} else {
// Fallback to legacy MINUS token for edge cases
tokens.push({ type: TokenType.MINUS, line, column });
}
}
break;
case '*':
tokens.push({ type: TokenType.MULTIPLY, line, column });
break;
case '/':
tokens.push({ type: TokenType.DIVIDE, line, column });
break;
case '%':
tokens.push({ type: TokenType.MODULO, line, column });
break;
case '^':
tokens.push({ type: TokenType.POWER, line, column });
break;
case '(':
tokens.push({ type: TokenType.LEFT_PAREN, line, column });
break;
case ')':
tokens.push({ type: TokenType.RIGHT_PAREN, line, column });
break;
case '{':
tokens.push({ type: TokenType.LEFT_BRACE, line, column });
break;
case '}':
tokens.push({ type: TokenType.RIGHT_BRACE, line, column });
break;
case '[':
tokens.push({ type: TokenType.LEFT_BRACKET, line, column });
break;
case ']':
tokens.push({ type: TokenType.RIGHT_BRACKET, line, column });
break;
case ';':
tokens.push({ type: TokenType.SEMICOLON, line, column });
break;
case ',':
tokens.push({ type: TokenType.COMMA, line, column });
break;
case '.':
tokens.push({ type: TokenType.DOT, line, column });
break;
case ':':
tokens.push({ type: TokenType.ASSIGNMENT, line, column });
break;
case '=':
if (input[current + 1] === '=') {
tokens.push({ type: TokenType.EQUALS, line, column });
current++;
column++;
} else {
// Single = is used for equality comparison in assertions
tokens.push({ type: TokenType.EQUALS, line, column });
}
break;
case '<':
if (input[current + 1] === '=') {
tokens.push({ type: TokenType.LESS_EQUAL, line, column });
current++;
column++;
} else {
tokens.push({ type: TokenType.LESS_THAN, line, column });
}
break;
case '>':
if (input[current + 1] === '=') {
tokens.push({ type: TokenType.GREATER_EQUAL, line, column });
current++;
column++;
} else {
tokens.push({ type: TokenType.GREATER_THAN, line, column });
}
break;
case '!':
if (input[current + 1] === '=') {
tokens.push({ type: TokenType.NOT_EQUAL, line, column });
current++;
column++;
} else {
throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
}
break;
default:
throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
}
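// Advance past the operator character (two-character operators already advanced once inside the switch)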
current++;
column++;
}
return tokens;
}