diff options
Diffstat (limited to 'js/scripting-lang/parser.js')
-rw-r--r-- | js/scripting-lang/parser.js | 893 |
1 files changed, 893 insertions, 0 deletions
diff --git a/js/scripting-lang/parser.js b/js/scripting-lang/parser.js new file mode 100644 index 0000000..b1aa77f --- /dev/null +++ b/js/scripting-lang/parser.js @@ -0,0 +1,893 @@ +// Parser for the scripting language +// Exports: parser(tokens) +// Converts tokens to an Abstract Syntax Tree (AST) + +import { TokenType } from './lexer.js'; + +/** + * Parser: Converts tokens to an Abstract Syntax Tree (AST). + * + * @param {Array.<Object>} tokens - Array of tokens from the lexer + * @returns {Object} Abstract Syntax Tree with program body + * @throws {Error} For parsing errors like unexpected tokens or missing delimiters + * + * @description The parser implements a combinator-based architecture where all + * operators are translated to function calls to standard library combinators. + * This eliminates parsing ambiguity while preserving the original syntax. + * + * The parser uses a recursive descent approach with proper operator precedence + * handling. Each operator expression (e.g., x + y) is translated to a FunctionCall + * node (e.g., add(x, y)) that will be executed by the interpreter using the + * corresponding combinator function. + * + * Key architectural decisions: + * - All operators become FunctionCall nodes to eliminate ambiguity + * - Operator precedence is handled through recursive parsing functions + * - Function calls are detected by looking for identifiers followed by expressions + * - When expressions and case patterns are parsed with special handling + * - Table literals and access are parsed as structured data + * + * The parser maintains a current token index and advances through the token + * stream, building the AST bottom-up from primary expressions to complex + * logical expressions. + */ +export function parser(tokens) { + let current = 0; + + /** + * Main parsing function that processes the entire token stream + * + * @returns {Object} Complete AST with program body + * @description Iterates through all tokens, parsing each statement or expression + * and building the program body. Handles empty programs gracefully. + */ + function parse() { + const body = []; + + while (current < tokens.length) { + const node = walk(); + if (node) { + body.push(node); + } + } + + return { type: 'Program', body }; + } + + /** + * Main walk function that dispatches to appropriate parsing functions + * + * @returns {Object|null} Parsed AST node or null for empty statements + * @description Determines the type of construct at the current position + * and delegates to the appropriate parsing function. The order of checks + * determines parsing precedence for top-level constructs. + * + * Parsing order: + * 1. IO operations (highest precedence for top-level constructs) + * 2. Assignments (identifier followed by assignment token) + * 3. When expressions (pattern matching) + * 4. Function definitions (explicit function declarations) + * 5. Logical expressions (default case for all other expressions) + */ + function walk() { + const token = tokens[current]; + + if (!token) return null; + + // Handle IO operations first + if (token.type === TokenType.IO_IN) { + return parseIOIn(); + } + if (token.type === TokenType.IO_OUT) { + return parseIOOut(); + } + if (token.type === TokenType.IO_ASSERT) { + return parseIOAssert(); + } + + // Handle assignments + if (token.type === TokenType.IDENTIFIER && + current + 1 < tokens.length && + tokens[current + 1].type === TokenType.ASSIGNMENT) { + return parseAssignment(); + } + + // Handle when expressions + if (token.type === TokenType.WHEN) { + return parseWhenExpression(); + } + + // Handle function definitions + if (token.type === TokenType.FUNCTION) { + return parseFunctionDefinition(); + } + + + + // For all other expressions, parse as logical expressions + return parseLogicalExpression(); + } + + /** + * Parse assignment statements: identifier : expression; + * + * @returns {Object} Assignment AST node + * @throws {Error} For malformed assignments or missing semicolons + * @description Parses variable assignments and function definitions. + * Supports both simple assignments (x : 42) and arrow function definitions + * (f : x y -> x + y). Also handles when expressions as assignment values. + * + * The function uses lookahead to distinguish between different assignment + * types and parses the value according to the detected type. + */ + function parseAssignment() { + const identifier = tokens[current].value; + current++; // Skip identifier + current++; // Skip assignment token (:) + + // Check if the value is a when expression + if (tokens[current].type === TokenType.WHEN) { + const value = parseWhenExpression(); + + // Expect semicolon + if (current < tokens.length && tokens[current].type === TokenType.SEMICOLON) { + current++; + } + + return { + type: 'Assignment', + identifier, + value + }; + } else { + // Check if this is an arrow function: param1 param2 -> body + const params = []; + let isArrowFunction = false; + + // Look ahead to see if this is an arrow function + let lookAhead = current; + while (lookAhead < tokens.length && tokens[lookAhead].type === TokenType.IDENTIFIER) { + lookAhead++; + } + + if (lookAhead < tokens.length && tokens[lookAhead].type === TokenType.ARROW) { + // This is an arrow function + isArrowFunction = true; + + // Parse parameters + while (current < tokens.length && tokens[current].type === TokenType.IDENTIFIER) { + params.push(tokens[current].value); + current++; + } + + if (current >= tokens.length || tokens[current].type !== TokenType.ARROW) { + throw new Error('Expected "->" after parameters in arrow function'); + } + current++; // Skip '->' + + // Check if the body is a when expression + let body; + if (tokens[current].type === TokenType.WHEN) { + body = parseWhenExpression(); + } else { + body = parseLogicalExpression(); + } + + // Expect semicolon + if (current < tokens.length && tokens[current].type === TokenType.SEMICOLON) { + current++; + } + + return { + type: 'Assignment', + identifier, + value: { + type: 'FunctionDeclaration', + params, + body + } + }; + } else { + // Parse the value as an expression (function calls will be handled by expression parsing) + const value = parseLogicalExpression(); + + // Expect semicolon + if (current < tokens.length && tokens[current].type === TokenType.SEMICOLON) { + current++; + } + + return { + type: 'Assignment', + identifier, + value + }; + } + } + } + + /** + * Parse when expressions: when value is pattern then result pattern then result; + * + * @returns {Object} WhenExpression AST node + * @throws {Error} For malformed when expressions + * @description Parses pattern matching expressions with support for single + * and multiple values/patterns. The when expression is the primary pattern + * matching construct in the language. + * + * Supports: + * - Single value patterns: when x is 42 then "correct" _ then "wrong" + * - Multiple value patterns: when x y is 0 0 then "both zero" _ _ then "not both" + * - Wildcard patterns: _ (matches any value) + * - Function references: @functionName + * + * The function parses values, patterns, and results, building a structured + * AST that the interpreter can efficiently evaluate. + */ + function parseWhenExpression() { + current++; // Skip 'when' + + // Parse the value(s) - can be single value or multiple values + const values = []; + while (current < tokens.length && tokens[current].type !== TokenType.IS) { + // For when expressions, we want to parse simple identifiers and expressions + // but not treat them as function calls + let value; + if (tokens[current].type === TokenType.IDENTIFIER) { + // Check if this is followed by another identifier (multi-value case) + if (current + 1 < tokens.length && + tokens[current + 1].type === TokenType.IDENTIFIER && + tokens[current + 2].type === TokenType.IS) { + // This is a multi-value case like "when x y is" + value = { type: 'Identifier', value: tokens[current].value }; + current++; + values.push(value); + value = { type: 'Identifier', value: tokens[current].value }; + current++; + values.push(value); + break; // We've consumed both values and will hit IS next + } else { + // Single identifier value + value = { type: 'Identifier', value: tokens[current].value }; + current++; + } + } else { + // For other types, use normal expression parsing + value = parseLogicalExpression(); + } + values.push(value); + } + + if (current >= tokens.length || tokens[current].type !== TokenType.IS) { + throw new Error('Expected "is" after value in when expression'); + } + current++; // Skip 'is' + + const cases = []; + + while (current < tokens.length) { + // Parse pattern(s) - can be single pattern or multiple patterns + const patterns = []; + + // Parse patterns until we hit THEN + while (current < tokens.length && tokens[current].type !== TokenType.THEN) { + let pattern; + if (process.env.DEBUG) { + console.log(`[DEBUG] parseWhenExpression: parsing pattern, current token = ${tokens[current].type}, value = ${tokens[current].value || 'N/A'}`); + } + if (tokens[current].type === TokenType.IDENTIFIER) { + pattern = { type: 'Identifier', value: tokens[current].value }; + current++; + } else if (tokens[current].type === TokenType.NUMBER) { + pattern = { type: 'NumberLiteral', value: tokens[current].value }; + current++; + } else if (tokens[current].type === TokenType.STRING) { + pattern = { type: 'StringLiteral', value: tokens[current].value }; + current++; + } else if (tokens[current].type === TokenType.WILDCARD) { + pattern = { type: 'WildcardPattern' }; + current++; + } else if (tokens[current].type === TokenType.FUNCTION_REF) { + pattern = { type: 'FunctionReference', name: tokens[current].name }; + current++; + } else { + throw new Error(`Expected pattern (identifier, number, string, wildcard, or function reference) in when expression, got ${tokens[current].type}`); + } + patterns.push(pattern); + } + + if (current >= tokens.length || tokens[current].type !== TokenType.THEN) { + throw new Error('Expected "then" after pattern in when expression'); + } + current++; // Skip 'then' + + // Parse result + const result = parseLogicalExpression(); + + cases.push({ + pattern: patterns, + result: [result] + }); + + // Stop parsing cases when we hit a semicolon + if (current < tokens.length && tokens[current].type === TokenType.SEMICOLON) { + current++; + break; + } else { + // No semicolon, but check if next token is a valid pattern + if ( + current >= tokens.length || + (tokens[current].type !== TokenType.IDENTIFIER && + tokens[current].type !== TokenType.NUMBER && + tokens[current].type !== TokenType.STRING && + tokens[current].type !== TokenType.WILDCARD && + tokens[current].type !== TokenType.FUNCTION_REF) + ) { + break; + } + } + } + + return { + type: 'WhenExpression', + value: values.length === 1 ? values[0] : values, + cases + }; + } + + /** + * Parse function definitions: function (params) : body + * + * @returns {Object} FunctionDefinition AST node + * @throws {Error} For malformed function definitions + * @description Parses explicit function declarations with parameter lists + * and function bodies. This is the traditional function definition syntax + * as opposed to arrow functions. + * + * The function expects: + * - function keyword + * - Parenthesized parameter list + * - Assignment token (:) + * - Function body expression + */ + function parseFunctionDefinition() { + current++; // Skip 'function' + + if (current >= tokens.length || tokens[current].type !== TokenType.LEFT_PAREN) { + throw new Error('Expected "(" after function keyword'); + } + current++; // Skip '(' + + const parameters = []; + while (current < tokens.length && tokens[current].type !== TokenType.RIGHT_PAREN) { + if (tokens[current].type === TokenType.IDENTIFIER) { + parameters.push(tokens[current].value); + current++; + + if (current < tokens.length && tokens[current].type === TokenType.COMMA) { + current++; // Skip comma + } + } else { + throw new Error('Expected parameter name in function definition'); + } + } + + if (current >= tokens.length || tokens[current].type !== TokenType.RIGHT_PAREN) { + throw new Error('Expected ")" after function parameters'); + } + current++; // Skip ')' + + if (current >= tokens.length || tokens[current].type !== TokenType.ASSIGNMENT) { + throw new Error('Expected ":" after function parameters'); + } + current++; // Skip ':' + + const body = parseLogicalExpression(); + + return { + type: 'FunctionDefinition', + parameters, + body + }; + } + + /** + * Parse IO input operations: ..in + * + * @returns {Object} IOInExpression AST node + * @description Parses input operations that read from standard input. + * The operation is represented as a simple AST node that the interpreter + * will handle by prompting for user input. + */ + function parseIOIn() { + current++; // Skip IO_IN token + return { type: 'IOInExpression' }; + } + + /** + * Parse IO output operations: ..out expression + * + * @returns {Object} IOOutExpression AST node + * @throws {Error} For malformed output expressions + * @description Parses output operations that write to standard output. + * The expression to output is parsed as a logical expression and will + * be evaluated by the interpreter before being printed. + */ + function parseIOOut() { + current++; // Skip IO_OUT token + const value = parseLogicalExpression(); + + // Expect semicolon + if (current < tokens.length && tokens[current].type === TokenType.SEMICOLON) { + current++; + } + + return { + type: 'IOOutExpression', + value + }; + } + + /** + * Parse IO assert operations: ..assert expression + * + * @returns {Object} IOAssertExpression AST node + * @throws {Error} For malformed assert expressions + * @description Parses assertion operations that verify conditions. + * The expression is parsed as a logical expression and will be evaluated + * by the interpreter. If the result is falsy, an assertion error is thrown. + */ + function parseIOAssert() { + current++; // Skip IO_ASSERT token + const value = parseLogicalExpression(); + + // Expect semicolon + if (current < tokens.length && tokens[current].type === TokenType.SEMICOLON) { + current++; + } + + return { + type: 'IOAssertExpression', + value + }; + } + + /** + * Parse logical expressions with proper precedence + * + * @returns {Object} AST node representing the logical expression + * @description Parses logical expressions (and, or, xor) with the lowest + * precedence. All logical operators are translated to FunctionCall nodes + * using the corresponding combinator functions. + * + * Operator precedence (lowest to highest): + * 1. Logical operators (and, or, xor) + * 2. Comparison operators (=, !=, <, >, <=, >=) + * 3. Additive operators (+, -) + * 4. Multiplicative operators (*, /, %) + * 5. Power operator (^) + * 6. Unary operators (not, -) + * 7. Primary expressions (literals, identifiers, function calls, parentheses) + */ + function parseLogicalExpression() { + let left = parseExpression(); + + while (current < tokens.length) { + const token = tokens[current]; + + if (token.type === TokenType.AND || + token.type === TokenType.OR || + token.type === TokenType.XOR) { + current++; + const right = parseExpression(); + left = { + type: 'FunctionCall', + name: token.type === TokenType.AND ? 'logicalAnd' : + token.type === TokenType.OR ? 'logicalOr' : 'logicalXor', + args: [left, right] + }; + } else { + break; + } + } + + return left; + } + + /** + * Parse comparison expressions + * + * @returns {Object} AST node representing the comparison expression + * @description Parses comparison expressions (=, !=, <, >, <=, >=) and + * additive expressions (+, -). All operators are translated to FunctionCall + * nodes using the corresponding combinator functions. + * + * This function implements the core of the combinator-based architecture + * by translating operator expressions to function calls that will be + * executed by the interpreter using standard library combinators. + */ + function parseExpression() { + let left = parseTerm(); + + while (current < tokens.length) { + const token = tokens[current]; + + if (process.env.DEBUG) { + console.log(`[DEBUG] parseExpression: current token = ${token.type}, value = ${token.value || 'N/A'}`); + } + + if (token.type === TokenType.PLUS || token.type === TokenType.MINUS) { + current++; + const right = parseTerm(); + left = { + type: 'FunctionCall', + name: token.type === TokenType.PLUS ? 'add' : 'subtract', + args: [left, right] + }; + } else if (token.type === TokenType.EQUALS || + token.type === TokenType.NOT_EQUAL || + token.type === TokenType.LESS_THAN || + token.type === TokenType.GREATER_THAN || + token.type === TokenType.LESS_EQUAL || + token.type === TokenType.GREATER_EQUAL) { + current++; + const right = parseTerm(); + left = { + type: 'FunctionCall', + name: token.type === TokenType.EQUALS ? 'equals' : + token.type === TokenType.NOT_EQUAL ? 'notEquals' : + token.type === TokenType.LESS_THAN ? 'lessThan' : + token.type === TokenType.GREATER_THAN ? 'greaterThan' : + token.type === TokenType.LESS_EQUAL ? 'lessEqual' : 'greaterEqual', + args: [left, right] + }; + } else { + break; + } + } + + return left; + } + + /** + * Parse multiplication and division expressions + * + * @returns {Object} AST node representing the multiplicative expression + * @description Parses multiplicative expressions (*, /, %) with higher + * precedence than additive expressions. All operators are translated to + * FunctionCall nodes using the corresponding combinator functions. + */ + function parseTerm() { + let left = parseFactor(); + + while (current < tokens.length) { + const token = tokens[current]; + + if (token.type === TokenType.MULTIPLY || + token.type === TokenType.DIVIDE || + token.type === TokenType.MODULO) { + current++; + const right = parseFactor(); + left = { + type: 'FunctionCall', + name: token.type === TokenType.MULTIPLY ? 'multiply' : + token.type === TokenType.DIVIDE ? 'divide' : 'modulo', + args: [left, right] + }; + } else { + break; + } + } + + return left; + } + + /** + * Parse power expressions and unary operators + * + * @returns {Object} AST node representing the factor expression + * @description Parses power expressions (^) and unary operators (not, -) + * with the highest precedence among operators. All operators are translated + * to FunctionCall nodes using the corresponding combinator functions. + */ + function parseFactor() { + let left = parsePrimary(); + + while (current < tokens.length) { + const token = tokens[current]; + + if (token.type === TokenType.POWER) { + current++; + const right = parsePrimary(); + left = { + type: 'FunctionCall', + name: 'power', + args: [left, right] + }; + } else { + break; + } + } + + return left; + } + + /** + * Parse table literals: {key: value, key2: value2} or {value1, value2, value3} + * + * @returns {Object} TableLiteral AST node + * @throws {Error} For malformed table literals + * @description Parses table literals with support for both key-value pairs + * and array-like entries. Tables are the primary data structure in the language. + * + * Supports: + * - Key-value pairs: {name: "Alice", age: 30} + * - Array-like entries: {1, 2, 3} + * - Mixed entries: {1, 2, name: "Alice", 3} + * + * Array-like entries are automatically assigned numeric keys starting from 1. + */ + function parseTableLiteral() { + current++; // Skip '{' + + const entries = []; + + while (current < tokens.length && tokens[current].type !== TokenType.RIGHT_BRACE) { + // Check if this is a key-value pair or just a value + let key = null; + let value; + + // Parse the first element + if (tokens[current].type === TokenType.IDENTIFIER) { + // Could be a key or a value + const identifier = tokens[current].value; + current++; + + if (current < tokens.length && tokens[current].type === TokenType.ASSIGNMENT) { + // This is a key-value pair: key : value + key = { type: 'Identifier', value: identifier }; + current++; // Skip ':' + value = parseLogicalExpression(); + } else { + // This is just a value (array-like entry) + value = { type: 'Identifier', value: identifier }; + } + } else { + // This is a value (array-like entry) + value = parseLogicalExpression(); + } + + entries.push({ key, value }); + + // Skip comma if present + if (current < tokens.length && tokens[current].type === TokenType.COMMA) { + current++; + } + } + + if (current >= tokens.length || tokens[current].type !== TokenType.RIGHT_BRACE) { + throw new Error('Expected "}" after table literal'); + } + current++; // Skip '}' + + return { + type: 'TableLiteral', + entries + }; + } + + + + /** + * Parse function calls: functionName arg1 arg2 ... + * + * @returns {Object} FunctionCall AST node + * @description Parses function calls with multiple arguments. This function + * is used by parsePrimary to detect when an identifier is followed by + * expressions that should be treated as function arguments. + * + * Function calls are detected by the presence of an identifier followed + * by expressions that are not operators. The parser uses lookahead to + * determine if an identifier should be treated as a function call. + */ + function parseFunctionCall() { + const functionName = tokens[current].value; + current++; // Skip function name + + // Parse arguments until we hit a semicolon or end of tokens + const args = []; + while (current < tokens.length && tokens[current].type !== TokenType.SEMICOLON) { + const arg = parseLogicalExpression(); + args.push(arg); + } + + return { + type: 'FunctionCall', + name: functionName, + args + }; + } + + /** + * Parse primary expressions (literals, identifiers, parenthesized expressions) + * + * @returns {Object} AST node representing the primary expression + * @throws {Error} For unexpected tokens or malformed expressions + * @description Parses the highest precedence expressions including literals, + * identifiers, function calls, table access, and parenthesized expressions. + * This is the foundation of the expression parsing hierarchy. + * + * The function implements sophisticated function call detection by looking + * for identifiers followed by expressions that could be arguments. This + * approach allows the language to support both traditional function calls + * and the ML-style function application syntax. + * + * Supports: + * - Literals: numbers, strings, booleans + * - Identifiers: variables and function names + * - Function calls: f(x, y) or f x y + * - Table access: table[key] or table.property + * - Parenthesized expressions: (x + y) + * - Unary operators: not x, -x + * - Function references: @functionName + */ + function parsePrimary() { + const token = tokens[current]; + + if (!token) { + throw new Error('Unexpected end of input'); + } + + if (process.env.DEBUG) { + console.log(`[DEBUG] parsePrimary: current token = ${token.type}, value = ${token.value || 'N/A'}`); + } + + switch (token.type) { + case TokenType.NUMBER: + current++; + return { type: 'NumberLiteral', value: token.value }; + + case TokenType.STRING: + current++; + return { type: 'StringLiteral', value: token.value }; + + case TokenType.TRUE: + current++; + return { type: 'BooleanLiteral', value: true }; + + case TokenType.FALSE: + current++; + return { type: 'BooleanLiteral', value: false }; + + case TokenType.IDENTIFIER: + const identifierValue = token.value; + current++; + + // Check for table access: identifier[key] or identifier.property + if (current < tokens.length && tokens[current].type === TokenType.LEFT_BRACKET) { + current++; // Skip '[' + const keyExpression = parseLogicalExpression(); + + if (current >= tokens.length || tokens[current].type !== TokenType.RIGHT_BRACKET) { + throw new Error('Expected "]" after table key'); + } + current++; // Skip ']' + + return { + type: 'TableAccess', + table: { type: 'Identifier', value: identifierValue }, + key: keyExpression + }; + } else if (current < tokens.length && tokens[current].type === TokenType.DOT) { + current++; // Skip '.' + + if (current >= tokens.length || tokens[current].type !== TokenType.IDENTIFIER) { + throw new Error('Expected identifier after "." in table access'); + } + + const propertyName = tokens[current].value; + current++; // Skip property name + + return { + type: 'TableAccess', + table: { type: 'Identifier', value: identifierValue }, + key: { type: 'Identifier', value: propertyName } + }; + } + + // Parse function call arguments (including parenthesized expressions) + const args = []; + while ( + current < tokens.length && + ( + tokens[current].type === TokenType.IDENTIFIER || + tokens[current].type === TokenType.NUMBER || + tokens[current].type === TokenType.STRING || + tokens[current].type === TokenType.LEFT_PAREN || + tokens[current].type === TokenType.LEFT_BRACE || + tokens[current].type === TokenType.TRUE || + tokens[current].type === TokenType.FALSE || + tokens[current].type === TokenType.FUNCTION_REF || + (tokens[current].type === TokenType.MINUS && + current + 1 < tokens.length && + tokens[current + 1].type === TokenType.NUMBER) + ) + ) { + // Special case: if we see FUNCTION_REF followed by MINUS followed by NUMBER, + // parse them as separate arguments + if (tokens[current].type === TokenType.FUNCTION_REF && + current + 1 < tokens.length && tokens[current + 1].type === TokenType.MINUS && + current + 2 < tokens.length && tokens[current + 2].type === TokenType.NUMBER) { + // Parse the function reference + args.push(parsePrimary()); + // Parse the unary minus as a separate argument + args.push(parsePrimary()); + } else { + // Parse each argument as a complete expression + args.push(parseExpression()); + } + } + if (args.length > 0) { + return { + type: 'FunctionCall', + name: identifierValue, + args + }; + } + return { type: 'Identifier', value: identifierValue }; + + case TokenType.LEFT_PAREN: + current++; + if (process.env.DEBUG) { + console.log(`[DEBUG] parsePrimary: parsing LEFT_PAREN, current token = ${tokens[current].type}`); + } + const expression = parseLogicalExpression(); + if (current >= tokens.length || tokens[current].type !== TokenType.RIGHT_PAREN) { + throw new Error('Expected ")" after expression'); + } + current++; + return expression; + + case TokenType.WILDCARD: + current++; + return { type: 'WildcardPattern' }; + + case TokenType.LEFT_BRACE: + return parseTableLiteral(); + + + + case TokenType.NOT: + current++; + const operand = parsePrimary(); + return { + type: 'FunctionCall', + name: 'logicalNot', + args: [operand] + }; + + case TokenType.MINUS: + current++; + const unaryOperand = parsePrimary(); + return { + type: 'FunctionCall', + name: 'negate', + args: [unaryOperand] + }; + + case TokenType.ARROW: + current++; + const arrowBody = parseLogicalExpression(); + return { type: 'ArrowExpression', body: arrowBody }; + + case TokenType.FUNCTION_REF: + const functionRef = { type: 'FunctionReference', name: token.name }; + current++; + return functionRef; + + default: + throw new Error(`Unexpected token in parsePrimary: ${token.type}`); + } + } + + return parse(); +} \ No newline at end of file |