about summary refs log tree commit diff stats
path: root/js/baba-yaga/src/legacy/lexer.js
diff options
context:
space:
mode:
Diffstat (limited to 'js/baba-yaga/src/legacy/lexer.js')
-rw-r--r-- js/baba-yaga/src/legacy/lexer.js 425
1 files changed, 425 insertions, 0 deletions
diff --git a/js/baba-yaga/src/legacy/lexer.js b/js/baba-yaga/src/legacy/lexer.js
new file mode 100644
index 0000000..054dd0e
--- /dev/null
+++ b/js/baba-yaga/src/legacy/lexer.js
@@ -0,0 +1,425 @@
+// lexer.js
+
+import { LexError, ErrorHelpers } from '../core/error.js';
+
+// Token categories emitted by the lexer. Each key maps to an identical string
+// value, so a token's `type` can be compared against `tokenTypes.X` or logged as-is.
+const tokenTypes = Object.fromEntries(
+  [
+    'IDENTIFIER', 'TYPE', 'NUMBER', 'STRING',
+    'ARROW', 'COLON', 'SEMICOLON', 'COMMA',
+    'KEYWORD', 'OPERATOR',
+    'LPAREN', 'RPAREN', 'DOT',
+    'LBRACKET', 'RBRACKET', 'LBRACE', 'RBRACE',
+    'EOF',
+  ].map((name) => [name, name]),
+);
+
+// Reserved words. An identifier-shaped lexeme found in this list is tagged
+// KEYWORD instead of IDENTIFIER (checked in nextToken after the TYPE names,
+// see createLexer).
+const keywords = [
+  'when', 'is', 'then', 'if',
+  'Ok', 'Err',
+  'true', 'false', 'PI', 'INFINITY',
+  'and', 'or', 'xor',
+];
+
+/**
+ * Creates a single-use lexer over a Baba Yaga source string.
+ *
+ * Scanning state (offset, line, column) lives in this closure and is never
+ * reset, so a second `allTokens()` call on the same lexer yields only EOF.
+ *
+ * Position metadata: STRING tokens carry the line/column of their opening
+ * quote; every other token carries the cursor line/column *after* the token
+ * has been consumed. That asymmetry is pre-existing behavior and is kept
+ * unchanged here, since consumers may rely on the reported values.
+ *
+ * @param {string} input - Source text to tokenize.
+ * @returns {{ allTokens: function(): Array<Object> }} Lexer handle.
+ * @throws {LexError} From `allTokens()`, on an unterminated string literal
+ *   or on a character that starts no token.
+ */
+function createLexer(input) {
+  let position = 0;
+  let line = 1;
+  let column = 1;
+
+  // Built-in type names; lexed as TYPE tokens rather than identifiers.
+  const typeNames = new Set([
+    'Int', 'String', 'Result', 'Float', 'Number', 'List', 'Table', 'Bool',
+  ]);
+
+  // Escape sequences recognized in string literals, keyed by the character
+  // that follows the backslash.
+  const stringEscapes = new Map([
+    ['n', '\n'],
+    ['t', '\t'],
+    ['r', '\r'],
+    ['\\', '\\'],
+    ['"', '"'],
+  ]);
+
+  // Two-character tokens. They are tried before any single-character rule so
+  // that, for example, `..` is not split into two DOT tokens.
+  const twoCharTokens = new Map([
+    ['..', tokenTypes.OPERATOR], // string concatenation
+    ['->', tokenTypes.ARROW],
+    ['>=', tokenTypes.OPERATOR],
+    ['<=', tokenTypes.OPERATOR],
+    ['!=', tokenTypes.OPERATOR],
+  ]);
+
+  // Single-character punctuation tokens.
+  const punctuation = new Map([
+    ['(', tokenTypes.LPAREN],
+    [')', tokenTypes.RPAREN],
+    ['[', tokenTypes.LBRACKET],
+    [']', tokenTypes.RBRACKET],
+    ['{', tokenTypes.LBRACE],
+    ['}', tokenTypes.RBRACE],
+    ['.', tokenTypes.DOT],
+    [':', tokenTypes.COLON],
+    [';', tokenTypes.SEMICOLON],
+    [',', tokenTypes.COMMA],
+  ]);
+
+  // Single-character operators; all are emitted as OPERATOR tokens.
+  const singleCharOperators = new Set(['+', '-', '*', '/', '=', '>', '<', '%']);
+
+  /** @returns {boolean} True when `char` is a whitespace character. */
+  function isWhitespace(char) {
+    return /\s/.test(char);
+  }
+
+  /** @returns {boolean} True when `char` is a decimal digit. */
+  function isDigit(char) {
+    return /\d/.test(char);
+  }
+
+  /**
+   * Word-character test. Despite the name it also accepts digits and `_`;
+   * a word still cannot start with a digit because nextToken checks isDigit
+   * before isLetter.
+   *
+   * @param {string} char - A single character.
+   * @returns {boolean} True when `char` may appear in a word.
+   */
+  function isLetter(char) {
+    return /[a-zA-Z_0-9]/.test(char);
+  }
+
+  /**
+   * @returns {string|undefined} The character under the cursor, or
+   *   `undefined` at end of input.
+   */
+  function peekChar() {
+    return input[position];
+  }
+
+  /**
+   * Consumes characters for as long as `predicate` accepts them. `line` is
+   * not updated, so this is only suitable for runs without newlines.
+   *
+   * @param {function(string): boolean} predicate - Per-character test.
+   * @returns {string} The consumed text (possibly empty).
+   */
+  function readWhile(predicate) {
+    let str = '';
+    while (position < input.length && predicate(input[position])) {
+      str += input[position];
+      position++;
+      column++;
+    }
+    return str;
+  }
+
+  /**
+   * Consumes the fixed text `value` at the cursor and wraps it in a token.
+   *
+   * @param {string} type - One of `tokenTypes`.
+   * @param {string} value - Exact source text of the token.
+   * @returns {Object} Token stamped with the post-consumption line/column.
+   */
+  function consume(type, value) {
+    position += value.length;
+    column += value.length;
+    return { type, value, line, column };
+  }
+
+  /**
+   * Reads a double-quoted string literal starting at the opening quote.
+   * Recognized escapes are decoded; an unknown escape keeps its backslash.
+   * Raw newlines are allowed inside the literal.
+   *
+   * @returns {Object} STRING token positioned at the opening quote.
+   * @throws {LexError} When the input ends before the closing quote.
+   */
+  function readString() {
+    const startLine = line;
+    const startColumn = column;
+    let str = '';
+
+    position++; // Skip the opening quote
+    column++;
+
+    while (position < input.length && input[position] !== '"') {
+      const char = input[position];
+
+      if (char === '\n') {
+        line++;
+        column = 1;
+      } else {
+        column++;
+      }
+
+      // At end of input the lookup key is undefined, which is never mapped.
+      const decoded = char === '\\' ? stringEscapes.get(input[position + 1]) : undefined;
+      if (decoded !== undefined) {
+        str += decoded;
+        position += 2; // backslash + escape character
+        column++;
+      } else {
+        str += char;
+        position++;
+      }
+    }
+
+    if (position >= input.length) {
+      throw new LexError(
+        'Unterminated string literal',
+        { line: startLine, column: startColumn, length: str.length + 1 },
+        input,
+        [
+          'Add closing quote " at the end of the string',
+          'Check for unescaped quotes inside the string',
+          'Use \\" to include quotes in strings',
+        ],
+      );
+    }
+
+    position++; // Skip the closing quote
+    column++;
+    return { type: tokenTypes.STRING, value: str, line: startLine, column: startColumn };
+  }
+
+  /**
+   * Reads an unsigned number at the cursor, optionally prefixed by a `sign`
+   * that the caller has already consumed.
+   *
+   * A `.` directly followed by another `.` is left alone: it is the `..`
+   * operator, not a decimal point (so `1..2` lexes as NUMBER, `..`, NUMBER).
+   *
+   * @param {string} [sign=''] - Already-consumed sign to prepend ('' or '-').
+   * @returns {Object} NUMBER token; `value` is the parsed number,
+   *   `originalString` the source text and `isFloat` whether it had a `.`.
+   */
+  function readNumber(sign = '') {
+    let text = sign + readWhile(isDigit);
+    let isFloat = false;
+
+    if (peekChar() === '.' && input[position + 1] !== '.') {
+      position++;
+      column++;
+      text += `.${readWhile(isDigit)}`;
+      isFloat = true;
+    }
+
+    return {
+      type: tokenTypes.NUMBER,
+      value: isFloat ? Number.parseFloat(text) : Number.parseInt(text, 10),
+      isFloat,
+      originalString: text,
+      line,
+      column,
+    };
+  }
+
+  /**
+   * Reads a negative number literal; the cursor must be on the `-`.
+   *
+   * @returns {Object} NUMBER token whose `originalString` starts with `-`.
+   */
+  function readNegativeNumber() {
+    position++; // Consume the minus sign
+    column++;
+    return readNumber('-');
+  }
+
+  /**
+   * Decides whether a `-` that is directly followed by a digit starts a
+   * negative literal (true) or is a binary minus (false), judging by the
+   * closest preceding non-whitespace character:
+   *
+   *   - none (start of input)                      -> negative literal
+   *   - `)` or a digit (incl. a name's last digit) -> binary minus
+   *   - anything else (operators, `(`, `,`, names) -> negative literal
+   *
+   * No look-ahead is needed: nextToken only calls this when the character
+   * right after the `-` is a digit, so an earlier scan for a following `-`
+   * from that position could never match and has been dropped.
+   *
+   * @returns {boolean} True when the `-` belongs to a number literal.
+   */
+  function shouldBeNegativeLiteral() {
+    let prevPos = position - 1;
+    while (prevPos >= 0 && isWhitespace(input[prevPos])) {
+      prevPos--;
+    }
+
+    if (prevPos < 0) {
+      return true;
+    }
+
+    const prevChar = input[prevPos];
+    return !(prevChar === ')' || isDigit(prevChar));
+  }
+
+  /**
+   * Reads a word (letters, digits, `_`) and classifies it.
+   *
+   * @returns {Object} TYPE, KEYWORD or IDENTIFIER token, tested in that order.
+   */
+  function readWord() {
+    const value = readWhile(isLetter);
+    let type = tokenTypes.IDENTIFIER;
+    if (typeNames.has(value)) {
+      type = tokenTypes.TYPE;
+    } else if (keywords.includes(value)) {
+      type = tokenTypes.KEYWORD;
+    }
+    return { type, value, line, column };
+  }
+
+  /**
+   * Advances past whitespace and `//` line comments.
+   *
+   * Implemented as a loop instead of re-entering nextToken for every skipped
+   * character, so long runs of blanks or comment lines cannot exhaust the
+   * call stack.
+   */
+  function skipTrivia() {
+    while (position < input.length) {
+      const char = input[position];
+
+      if (isWhitespace(char)) {
+        if (char === '\n') {
+          line++;
+          column = 1;
+        } else {
+          column++;
+        }
+        position++;
+      } else if (char === '/' && input[position + 1] === '/') {
+        // Stop on the newline itself so the branch above does the line count.
+        while (position < input.length && input[position] !== '\n') {
+          position++;
+          column++;
+        }
+      } else {
+        return;
+      }
+    }
+  }
+
+  /**
+   * Builds the error for a character that starts no token, with a hint for
+   * common typographic look-alikes.
+   *
+   * @param {string} char - The offending character.
+   * @returns {LexError} Error for the caller to throw.
+   */
+  function unexpectedCharacter(char) {
+    const suggestions = [];
+
+    // Escapes keep the look-alikes unambiguous in any editor or encoding.
+    if (char === '\u201C' || char === '\u201D') {
+      suggestions.push('Use straight quotes " instead of curly quotes');
+    } else if (char === '\u2013' || char === '\u2014') {
+      suggestions.push('Use regular minus - or arrow -> instead of em/en dash');
+    } else if (/[^\x00-\x7F]/.test(char)) {
+      suggestions.push('Use only ASCII characters in Baba Yaga code');
+    } else {
+      suggestions.push(`Character "${char}" is not valid in Baba Yaga syntax`);
+    }
+
+    return new LexError(
+      `Unexpected character: ${JSON.stringify(char)}`,
+      { line, column, length: 1 },
+      input,
+      suggestions,
+    );
+  }
+
+  /**
+   * Scans and returns the next token, or EOF once the input is exhausted.
+   *
+   * @returns {Object} The next token.
+   * @throws {LexError} On an unterminated string or an unexpected character.
+   */
+  function nextToken() {
+    skipTrivia();
+
+    if (position >= input.length) {
+      return { type: tokenTypes.EOF, line, column };
+    }
+
+    const char = input[position];
+
+    const pair = position + 1 < input.length ? char + input[position + 1] : char;
+    if (twoCharTokens.has(pair)) {
+      return consume(twoCharTokens.get(pair), pair);
+    }
+
+    // A `-` glued to a digit is either a negative literal or a binary minus,
+    // depending on what precedes it; a binary minus falls through to the
+    // operator rule below.
+    const nextIsDigit = position + 1 < input.length && isDigit(input[position + 1]);
+    if (char === '-' && nextIsDigit && shouldBeNegativeLiteral()) {
+      return readNegativeNumber();
+    }
+
+    if (isDigit(char)) {
+      return readNumber();
+    }
+
+    if (isLetter(char)) {
+      return readWord();
+    }
+
+    if (char === '"') {
+      return readString();
+    }
+
+    if (punctuation.has(char)) {
+      return consume(punctuation.get(char), char);
+    }
+
+    if (singleCharOperators.has(char)) {
+      return consume(tokenTypes.OPERATOR, char);
+    }
+
+    throw unexpectedCharacter(char);
+  }
+
+  /**
+   * Tokenizes the remaining input.
+   *
+   * @returns {Array<Object>} All tokens in order, always ending with EOF.
+   * @throws {LexError} On the first lexical error encountered.
+   */
+  function allTokens() {
+    const tokens = [];
+    let token;
+    do {
+      token = nextToken();
+      tokens.push(token);
+    } while (token.type !== tokenTypes.EOF);
+    return tokens;
+  }
+
+  return { allTokens };
+}
+
+export { createLexer, tokenTypes };