diff options
Diffstat (limited to 'js/scripting-lang/lexer.js')
-rw-r--r-- | js/scripting-lang/lexer.js | 397 |
1 files changed, 397 insertions, 0 deletions
// Lexer for the scripting language
// Supports both Node.js and browser environments

/**
 * Token types for the language.
 *
 * @description Central enumeration of every token type the lexer can emit.
 * Each key maps to a string of the same name, so a token's `type` is
 * directly readable when printed.
 *
 * Categories:
 * - Literals: NUMBER, STRING, TRUE, FALSE
 * - Operators: PLUS, MINUS, MULTIPLY, DIVIDE, MODULO, POWER, comparison
 *   operators, and the word operators AND, OR, XOR, NOT
 * - Keywords: CASE, OF, WHEN, IS, THEN, FUNCTION, WILDCARD (`_`)
 * - Punctuation: parentheses, braces, brackets, SEMICOLON, COMMA, DOT,
 *   ASSIGNMENT (`:`), ARROW (`->`)
 * - Special: IO_IN, IO_OUT, IO_ASSERT (`..in`, `..out`, `..assert`) and
 *   FUNCTION_REF (`@name`)
 */
export const TokenType = {
  NUMBER: 'NUMBER',
  PLUS: 'PLUS',
  MINUS: 'MINUS',
  MULTIPLY: 'MULTIPLY',
  DIVIDE: 'DIVIDE',
  IDENTIFIER: 'IDENTIFIER',
  ASSIGNMENT: 'ASSIGNMENT',
  ARROW: 'ARROW',
  CASE: 'CASE',
  OF: 'OF',
  WHEN: 'WHEN',
  IS: 'IS',
  THEN: 'THEN',
  WILDCARD: 'WILDCARD',
  FUNCTION: 'FUNCTION',
  LEFT_PAREN: 'LEFT_PAREN',
  RIGHT_PAREN: 'RIGHT_PAREN',
  LEFT_BRACE: 'LEFT_BRACE',
  RIGHT_BRACE: 'RIGHT_BRACE',
  LEFT_BRACKET: 'LEFT_BRACKET',
  RIGHT_BRACKET: 'RIGHT_BRACKET',
  SEMICOLON: 'SEMICOLON',
  COMMA: 'COMMA',
  DOT: 'DOT',
  STRING: 'STRING',
  TRUE: 'TRUE',
  FALSE: 'FALSE',
  AND: 'AND',
  OR: 'OR',
  XOR: 'XOR',
  NOT: 'NOT',
  EQUALS: 'EQUALS',
  LESS_THAN: 'LESS_THAN',
  GREATER_THAN: 'GREATER_THAN',
  LESS_EQUAL: 'LESS_EQUAL',
  GREATER_EQUAL: 'GREATER_EQUAL',
  NOT_EQUAL: 'NOT_EQUAL',
  MODULO: 'MODULO',
  POWER: 'POWER',
  IO_IN: 'IO_IN',
  IO_OUT: 'IO_OUT',
  IO_ASSERT: 'IO_ASSERT',
  FUNCTION_REF: 'FUNCTION_REF'
};

// Single-character classes, hoisted so they are not rebuilt per character.
// None of them is global/sticky, so `.test` carries no state between calls.
const WHITESPACE = /\s/;
const DIGIT = /[0-9]/;
const LETTER = /[a-zA-Z]/;
const IDENTIFIER_START = /[a-zA-Z_]/;
const WORD_CHAR = /[a-zA-Z0-9_]/;
const NOT_NEWLINE = /[^\n]/;

// Maps are used instead of plain objects so that identifiers such as
// `constructor` or `toString` can never match an inherited property.

// Reserved words that produce a value-less token.
const KEYWORDS = new Map([
  ['and', TokenType.AND],
  ['or', TokenType.OR],
  ['xor', TokenType.XOR],
  ['not', TokenType.NOT],
  ['case', TokenType.CASE],
  ['of', TokenType.OF],
  ['when', TokenType.WHEN],
  ['is', TokenType.IS],
  ['then', TokenType.THEN],
  ['function', TokenType.FUNCTION],
  ['_', TokenType.WILDCARD]
]);

// Names accepted after the `..` IO prefix.
const IO_OPERATIONS = new Map([
  ['in', TokenType.IO_IN],
  ['out', TokenType.IO_OUT],
  ['assert', TokenType.IO_ASSERT]
]);

// Escape sequences with a special meaning; any other escaped character
// (including `\\` and `\"`) stands for itself.
const ESCAPES = new Map([
  ['n', '\n'],
  ['t', '\t'],
  ['r', '\r']
]);

// Operators that always consist of exactly one character.
const PUNCTUATION = new Map([
  ['+', TokenType.PLUS],
  ['*', TokenType.MULTIPLY],
  ['/', TokenType.DIVIDE],
  ['%', TokenType.MODULO],
  ['^', TokenType.POWER],
  ['(', TokenType.LEFT_PAREN],
  [')', TokenType.RIGHT_PAREN],
  ['{', TokenType.LEFT_BRACE],
  ['}', TokenType.RIGHT_BRACE],
  ['[', TokenType.LEFT_BRACKET],
  [']', TokenType.RIGHT_BRACKET],
  [';', TokenType.SEMICOLON],
  [',', TokenType.COMMA],
  ['.', TokenType.DOT],
  [':', TokenType.ASSIGNMENT]
]);

// Operators that may be followed by a second character. `single` is the
// token emitted when the follower is absent; `null` means the character is
// not valid on its own. Both `=` and `==` lex as EQUALS.
const COMPOUND_OPERATORS = new Map([
  ['-', { follower: '>', pair: TokenType.ARROW, single: TokenType.MINUS }],
  ['=', { follower: '=', pair: TokenType.EQUALS, single: TokenType.EQUALS }],
  ['<', { follower: '=', pair: TokenType.LESS_EQUAL, single: TokenType.LESS_THAN }],
  ['>', { follower: '=', pair: TokenType.GREATER_EQUAL, single: TokenType.GREATER_THAN }],
  ['!', { follower: '=', pair: TokenType.NOT_EQUAL, single: null }]
]);

/**
 * Converts source code into tokens.
 *
 * @param {string} input - The source code to tokenize
 * @returns {Array.<Object>} Array of token objects. Every token has `type`,
 *   `line` and `column` (both 1-based, pointing at the token's first
 *   character). NUMBER, STRING, TRUE, FALSE and IDENTIFIER tokens also carry
 *   `value`; FUNCTION_REF tokens carry `name`.
 * @throws {Error} On an unexpected character, a lone `!`, an unknown `..`
 *   IO operation, an `@` without a name, an unterminated string literal, or
 *   an unterminated block comment.
 *
 * @description Scans the input one character at a time with one character
 * of lookahead for two-character tokens, skipping whitespace, `//` line
 * comments and block comments.
 *
 * Numbers are a run of digits with an optional fraction; a `.` is only
 * treated as a decimal point when a digit follows it, so `1.2.3` lexes as
 * NUMBER DOT NUMBER and `5..out` as NUMBER IO_OUT instead of silently
 * discarding the trailing text.
 *
 * String literals are double-quoted, support `\n`, `\t`, `\r`, `\\` and
 * `\"` (any other escaped character stands for itself), and may span lines;
 * line/column tracking continues correctly after them.
 */
export function lexer(input) {
  const tokens = [];
  let current = 0;
  let line = 1;
  let column = 1;

  // Consumes the run of characters matching `pattern` and returns it.
  // Only used for runs that cannot contain a newline, so advancing the
  // column by the run length is sufficient.
  const readWhile = (pattern) => {
    const start = current;
    while (current < input.length && pattern.test(input[current])) {
      current++;
    }
    column += current - start;
    return input.slice(start, current);
  };

  // Advances over one character, keeping line/column in sync.
  const advanceOver = (ch) => {
    if (ch === '\n') {
      line++;
      column = 1;
    } else {
      column++;
    }
    current++;
  };

  while (current < input.length) {
    const char = input[current];
    const next = input[current + 1];

    // Skip whitespace
    if (WHITESPACE.test(char)) {
      advanceOver(char);
      continue;
    }

    // Skip single-line comments; the newline itself is handled as whitespace
    if (char === '/' && next === '/') {
      readWhile(NOT_NEWLINE);
      continue;
    }

    // Skip block comments
    if (char === '/' && next === '*') {
      const closing = input.indexOf('*/', current + 2);
      if (closing === -1) {
        // Without this check the final character of the input would be
        // tokenized as if it were code.
        throw new Error(`Unterminated comment starting at line ${line}, column ${column}`);
      }
      const stop = closing + 2;
      while (current < stop) {
        advanceOver(input[current]);
      }
      continue;
    }

    // IO operations (..in, ..out, ..assert)
    if (char === '.' && next === '.') {
      const startColumn = column;
      current += 2;
      column += 2;
      const operation = readWhile(LETTER);
      const ioType = IO_OPERATIONS.get(operation);
      if (ioType === undefined) {
        throw new Error(`Unknown IO operation: ..${operation} at line ${line}, column ${startColumn}`);
      }
      tokens.push({ type: ioType, line, column: startColumn });
      continue;
    }

    // Function references (@function)
    if (char === '@') {
      const startColumn = column;
      current++;
      column++;
      const functionName = readWhile(WORD_CHAR);
      if (functionName === '') {
        throw new Error(`Invalid function reference at line ${line}, column ${startColumn}`);
      }
      tokens.push({ type: TokenType.FUNCTION_REF, name: functionName, line, column: startColumn });
      continue;
    }

    // Numbers: digits, then at most one fraction introduced by ".<digit>"
    if (DIGIT.test(char)) {
      const startColumn = column;
      let text = readWhile(DIGIT);
      const hasFraction =
        input[current] === '.' &&
        current + 1 < input.length &&
        DIGIT.test(input[current + 1]);
      if (hasFraction) {
        current++;
        column++;
        text += `.${readWhile(DIGIT)}`;
      }
      tokens.push({ type: TokenType.NUMBER, value: Number.parseFloat(text), line, column: startColumn });
      continue;
    }

    // Identifiers and keywords
    if (IDENTIFIER_START.test(char)) {
      const startColumn = column;
      const word = readWhile(WORD_CHAR);
      if (word === 'true') {
        tokens.push({ type: TokenType.TRUE, value: true, line, column: startColumn });
      } else if (word === 'false') {
        tokens.push({ type: TokenType.FALSE, value: false, line, column: startColumn });
      } else if (KEYWORDS.has(word)) {
        tokens.push({ type: KEYWORDS.get(word), line, column: startColumn });
      } else {
        tokens.push({ type: TokenType.IDENTIFIER, value: word, line, column: startColumn });
      }
      continue;
    }

    // Strings
    if (char === '"') {
      // Remember where the literal starts: the decoded value can be shorter
      // than the source text (escapes) and the literal may span lines.
      const startLine = line;
      const startColumn = column;
      let value = '';
      let closed = false;
      current++;
      column++;
      while (current < input.length) {
        let ch = input[current];
        if (ch === '"') {
          current++;
          column++;
          closed = true;
          break;
        }
        if (ch === '\\') {
          current++;
          column++;
          if (current >= input.length) {
            break; // backslash was the last character of the input
          }
          ch = input[current];
          value += ESCAPES.has(ch) ? ESCAPES.get(ch) : ch;
        } else {
          value += ch;
        }
        advanceOver(ch);
      }
      if (!closed) {
        throw new Error(`Unterminated string starting at line ${startLine}, column ${startColumn}`);
      }
      tokens.push({ type: TokenType.STRING, value, line: startLine, column: startColumn });
      continue;
    }

    // Operators that may take a second character (->, ==, <=, >=, !=)
    const compound = COMPOUND_OPERATORS.get(char);
    if (compound !== undefined) {
      if (next === compound.follower) {
        tokens.push({ type: compound.pair, line, column });
        current += 2;
        column += 2;
      } else if (compound.single !== null) {
        tokens.push({ type: compound.single, line, column });
        current++;
        column++;
      } else {
        throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
      }
      continue;
    }

    // Single-character operators and punctuation
    const punctuationType = PUNCTUATION.get(char);
    if (punctuationType === undefined) {
      throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
    }
    tokens.push({ type: punctuationType, line, column });
    current++;
    column++;
  }

  return tokens;
}