about summary refs log tree commit diff stats
path: root/js/scripting-lang/lexer.js
diff options
context:
space:
mode:
Diffstat (limited to 'js/scripting-lang/lexer.js')
-rw-r--r-- js/scripting-lang/lexer.js 397
1 files changed, 397 insertions, 0 deletions
diff --git a/js/scripting-lang/lexer.js b/js/scripting-lang/lexer.js
new file mode 100644
index 0000000..de87ac7
--- /dev/null
+++ b/js/scripting-lang/lexer.js
@@ -0,0 +1,397 @@
+// Lexer for the scripting language
+// Supports both Node.js and browser environments
+
+/**
+ * Token types for the language
+ *
+ * @description Defines all token types used by the lexer and parser.
+ * Every member maps a name to an identical string tag, so a token's `type`
+ * field can be compared directly against `TokenType.X`.
+ *
+ * The token types fall into these categories:
+ * - Literals: NUMBER, STRING, TRUE, FALSE
+ * - Names and patterns: IDENTIFIER, WILDCARD
+ * - Arithmetic operators: PLUS, MINUS, MULTIPLY, DIVIDE, MODULO, POWER
+ * - Comparison operators: EQUALS, NOT_EQUAL, LESS_THAN, GREATER_THAN,
+ *   LESS_EQUAL, GREATER_EQUAL
+ * - Logical operators: AND, OR, XOR, NOT
+ * - Keywords: CASE, OF, WHEN, IS, THEN, FUNCTION
+ * - Punctuation: LEFT_PAREN, RIGHT_PAREN, LEFT_BRACE, RIGHT_BRACE,
+ *   LEFT_BRACKET, RIGHT_BRACKET, SEMICOLON, COMMA, DOT, ASSIGNMENT, ARROW
+ * - Special: IO_IN, IO_OUT, IO_ASSERT, FUNCTION_REF
+ *
+ * The object is frozen: it is a shared, exported constant, and an accidental
+ * write from another module would silently change the tags every consumer
+ * compares against.
+ */
+export const TokenType = Object.freeze({
+    NUMBER: 'NUMBER',
+    PLUS: 'PLUS',
+    MINUS: 'MINUS',
+    MULTIPLY: 'MULTIPLY',
+    DIVIDE: 'DIVIDE',
+    IDENTIFIER: 'IDENTIFIER',
+    ASSIGNMENT: 'ASSIGNMENT',
+    ARROW: 'ARROW',
+    CASE: 'CASE',
+    OF: 'OF',
+    WHEN: 'WHEN',
+    IS: 'IS',
+    THEN: 'THEN',
+    WILDCARD: 'WILDCARD',
+    FUNCTION: 'FUNCTION',
+    LEFT_PAREN: 'LEFT_PAREN',
+    RIGHT_PAREN: 'RIGHT_PAREN',
+    LEFT_BRACE: 'LEFT_BRACE',
+    RIGHT_BRACE: 'RIGHT_BRACE',
+    LEFT_BRACKET: 'LEFT_BRACKET',
+    RIGHT_BRACKET: 'RIGHT_BRACKET',
+    SEMICOLON: 'SEMICOLON',
+    COMMA: 'COMMA',
+    DOT: 'DOT',
+    STRING: 'STRING',
+    TRUE: 'TRUE',
+    FALSE: 'FALSE',
+    AND: 'AND',
+    OR: 'OR',
+    XOR: 'XOR',
+    NOT: 'NOT',
+    EQUALS: 'EQUALS',
+    LESS_THAN: 'LESS_THAN',
+    GREATER_THAN: 'GREATER_THAN',
+    LESS_EQUAL: 'LESS_EQUAL',
+    GREATER_EQUAL: 'GREATER_EQUAL',
+    NOT_EQUAL: 'NOT_EQUAL',
+    MODULO: 'MODULO',
+    POWER: 'POWER',
+    IO_IN: 'IO_IN',
+    IO_OUT: 'IO_OUT',
+    IO_ASSERT: 'IO_ASSERT',
+    FUNCTION_REF: 'FUNCTION_REF'
+});
+
+// Character classes used while scanning. Hoisted so each regex is built once
+// instead of once per input character. None use the `g`/`y` flags, so
+// `.test()` is stateless.
+const WHITESPACE = /\s/;
+const DIGIT = /[0-9]/;
+const LETTER = /[a-zA-Z]/;
+const IDENTIFIER_START = /[a-zA-Z_]/;
+const IDENTIFIER_PART = /[a-zA-Z0-9_]/;
+
+// Reserved words that produce a value-less token. A Map (rather than a plain
+// object) is used so identifiers such as `constructor` or `toString` can never
+// match an inherited property and be mistaken for a keyword.
+const KEYWORD_TOKENS = new Map([
+    ['and', TokenType.AND],
+    ['or', TokenType.OR],
+    ['xor', TokenType.XOR],
+    ['not', TokenType.NOT],
+    ['case', TokenType.CASE],
+    ['of', TokenType.OF],
+    ['when', TokenType.WHEN],
+    ['is', TokenType.IS],
+    ['then', TokenType.THEN],
+    ['function', TokenType.FUNCTION],
+    ['_', TokenType.WILDCARD]
+]);
+
+// Names accepted after the `..` IO prefix.
+const IO_TOKENS = new Map([
+    ['in', TokenType.IO_IN],
+    ['out', TokenType.IO_OUT],
+    ['assert', TokenType.IO_ASSERT]
+]);
+
+// Escape letters with a special meaning inside string literals. Any other
+// escaped character (including `\\` and `\"`) stands for itself.
+const STRING_ESCAPES = new Map([
+    ['n', '\n'],
+    ['t', '\t'],
+    ['r', '\r']
+]);
+
+// Operators that may be extended by one following character. `single` is the
+// token emitted when the follower is absent; `null` means the bare character
+// is not a valid token on its own.
+const COMPOUND_OPERATORS = new Map([
+    ['-', { follower: '>', pair: TokenType.ARROW, single: TokenType.MINUS }],
+    // A lone `=` and `==` both mean equality (single `=` is used in assertions).
+    ['=', { follower: '=', pair: TokenType.EQUALS, single: TokenType.EQUALS }],
+    ['<', { follower: '=', pair: TokenType.LESS_EQUAL, single: TokenType.LESS_THAN }],
+    ['>', { follower: '=', pair: TokenType.GREATER_EQUAL, single: TokenType.GREATER_THAN }],
+    ['!', { follower: '=', pair: TokenType.NOT_EQUAL, single: null }]
+]);
+
+// Operators and punctuation that are always exactly one character long.
+const SINGLE_CHAR_TOKENS = new Map([
+    ['+', TokenType.PLUS],
+    ['*', TokenType.MULTIPLY],
+    ['/', TokenType.DIVIDE],
+    ['%', TokenType.MODULO],
+    ['^', TokenType.POWER],
+    ['(', TokenType.LEFT_PAREN],
+    [')', TokenType.RIGHT_PAREN],
+    ['{', TokenType.LEFT_BRACE],
+    ['}', TokenType.RIGHT_BRACE],
+    ['[', TokenType.LEFT_BRACKET],
+    [']', TokenType.RIGHT_BRACKET],
+    [';', TokenType.SEMICOLON],
+    [',', TokenType.COMMA],
+    ['.', TokenType.DOT],
+    [':', TokenType.ASSIGNMENT]
+]);
+
+/**
+ * Converts source code into tokens
+ *
+ * @param {string} input - The source code to tokenize
+ * @returns {Array.<Object>} Array of token objects. Every token has `type`,
+ * `line` and `column` (both 1-based, pointing at the token's first character).
+ * NUMBER, STRING, IDENTIFIER, TRUE and FALSE tokens also carry `value`;
+ * FUNCTION_REF tokens carry `name`.
+ * @throws {Error} For unexpected characters, unknown `..` IO operations,
+ * an `@` without a name, and unterminated strings or block comments
+ *
+ * @description The lexer scans the input one character at a time, with one
+ * character of lookahead for two-character tokens, and tracks line/column
+ * for error reporting.
+ *
+ * Scanning rules, in priority order:
+ * - Whitespace and comments (`// ...` and `/* ... *` + `/`) are skipped
+ * - `..in`, `..out`, `..assert` become IO tokens
+ * - `@name` becomes a FUNCTION_REF
+ * - Numbers are digits with an optional fractional part; a `.` is part of
+ *   the number only when a digit follows it, so `arr.0.name` and `5..out`
+ *   keep their DOT / IO tokens and `1.2.3` is not silently truncated
+ * - Identifiers and keywords (`_` alone is the WILDCARD)
+ * - Double-quoted strings with `\n`, `\t`, `\r` escapes; any other escaped
+ *   character stands for itself. Strings may span lines.
+ * - Operators and punctuation
+ */
+export function lexer(input) {
+    const tokens = [];
+    let current = 0;
+    let line = 1;
+    let column = 1;
+
+    // Consume one character, keeping line/column in sync with `current`.
+    const advance = () => {
+        if (input[current] === '\n') {
+            line++;
+            column = 1;
+        } else {
+            column++;
+        }
+        current++;
+    };
+
+    // Consume the longest run of characters matching `pattern` and return it.
+    // Only used with classes that cannot match a newline.
+    const readWhile = (pattern) => {
+        const start = current;
+        while (current < input.length && pattern.test(input[current])) {
+            current++;
+            column++;
+        }
+        return input.slice(start, current);
+    };
+
+    while (current < input.length) {
+        const char = input[current];
+        const next = input[current + 1];
+        // Position of the token's first character, captured before consuming.
+        const startLine = line;
+        const startColumn = column;
+
+        // Skip whitespace
+        if (WHITESPACE.test(char)) {
+            advance();
+            continue;
+        }
+
+        // Skip single-line comments; the newline itself is left for the
+        // whitespace branch so the line counter is updated in one place.
+        if (char === '/' && next === '/') {
+            while (current < input.length && input[current] !== '\n') {
+                advance();
+            }
+            continue;
+        }
+
+        // Skip multi-line comments /* ... */
+        if (char === '/' && next === '*') {
+            advance();
+            advance();
+            while (current < input.length && !(input[current] === '*' && input[current + 1] === '/')) {
+                advance();
+            }
+            if (current >= input.length) {
+                // Without this check the tail of the comment would be lexed as code.
+                throw new Error(`Unterminated comment at line ${startLine}, column ${startColumn}`);
+            }
+            advance();
+            advance();
+            continue;
+        }
+
+        // IO operations (..in, ..out, ..assert)
+        if (char === '.' && next === '.') {
+            advance();
+            advance();
+            const operation = readWhile(LETTER);
+            const ioType = IO_TOKENS.get(operation);
+            if (ioType === undefined) {
+                throw new Error(`Unknown IO operation: ..${operation} at line ${line}, column ${startColumn}`);
+            }
+            tokens.push({ type: ioType, line, column: startColumn });
+            continue;
+        }
+
+        // Function references (@function)
+        if (char === '@') {
+            advance();
+            const functionName = readWhile(IDENTIFIER_PART);
+            if (functionName === '') {
+                throw new Error(`Invalid function reference at line ${line}, column ${startColumn}`);
+            }
+            tokens.push({ type: TokenType.FUNCTION_REF, name: functionName, line, column: startColumn });
+            continue;
+        }
+
+        // Numbers: integer part, then at most one fractional part.
+        if (DIGIT.test(char)) {
+            let text = readWhile(DIGIT);
+            const hasFraction = input[current] === '.'
+                && current + 1 < input.length
+                && DIGIT.test(input[current + 1]);
+            if (hasFraction) {
+                advance(); // the decimal point
+                text += `.${readWhile(DIGIT)}`;
+            }
+            tokens.push({ type: TokenType.NUMBER, value: parseFloat(text), line, column: startColumn });
+            continue;
+        }
+
+        // Identifiers and keywords
+        if (IDENTIFIER_START.test(char)) {
+            const word = readWhile(IDENTIFIER_PART);
+            if (word === 'true') {
+                tokens.push({ type: TokenType.TRUE, value: true, line, column: startColumn });
+            } else if (word === 'false') {
+                tokens.push({ type: TokenType.FALSE, value: false, line, column: startColumn });
+            } else if (KEYWORD_TOKENS.has(word)) {
+                tokens.push({ type: KEYWORD_TOKENS.get(word), line, column: startColumn });
+            } else {
+                tokens.push({ type: TokenType.IDENTIFIER, value: word, line, column: startColumn });
+            }
+            continue;
+        }
+
+        // Strings
+        if (char === '"') {
+            advance(); // opening quote
+            let value = '';
+            while (current < input.length && input[current] !== '"') {
+                if (input[current] === '\\') {
+                    advance(); // the backslash
+                    if (current >= input.length) {
+                        break; // dangling backslash at end of input
+                    }
+                    const escaped = input[current];
+                    value += STRING_ESCAPES.has(escaped) ? STRING_ESCAPES.get(escaped) : escaped;
+                } else {
+                    value += input[current];
+                }
+                advance();
+            }
+            if (current >= input.length) {
+                throw new Error(`Unterminated string at line ${startLine}, column ${startColumn}`);
+            }
+            advance(); // closing quote
+            // Report where the literal starts: the decoded value's length does
+            // not match the source length once escapes are involved.
+            tokens.push({ type: TokenType.STRING, value, line: startLine, column: startColumn });
+            continue;
+        }
+
+        // Operators that may take a second character (->, ==, <=, >=, !=)
+        const compound = COMPOUND_OPERATORS.get(char);
+        if (compound !== undefined) {
+            if (next === compound.follower) {
+                tokens.push({ type: compound.pair, line, column: startColumn });
+                advance();
+                advance();
+            } else if (compound.single !== null) {
+                tokens.push({ type: compound.single, line, column: startColumn });
+                advance();
+            } else {
+                throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
+            }
+            continue;
+        }
+
+        // Single-character operators and punctuation
+        const simpleType = SINGLE_CHAR_TOKENS.get(char);
+        if (simpleType === undefined) {
+            throw new Error(`Unexpected character: ${char} at line ${line}, column ${column}`);
+        }
+        tokens.push({ type: simpleType, line, column: startColumn });
+        advance();
+    }
+
+    return tokens;
+}
\ No newline at end of file