diff options
Diffstat (limited to 'js/scripting-lang/baba-yaga-c/src/lexer.c')
-rw-r--r-- | js/scripting-lang/baba-yaga-c/src/lexer.c | 826
1 files changed, 0 insertions, 826 deletions
diff --git a/js/scripting-lang/baba-yaga-c/src/lexer.c b/js/scripting-lang/baba-yaga-c/src/lexer.c
deleted file mode 100644
index 31a582f..0000000
--- a/js/scripting-lang/baba-yaga-c/src/lexer.c
+++ /dev/null
@@ -1,826 +0,0 @@
-/**
- * @file lexer.c
- * @brief Lexer implementation for Baba Yaga
- * @author eli_oat
- * @version 0.0.1
- * @date 2025
- *
- * This file implements the lexical analyzer for the Baba Yaga language.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <math.h>
-
-#include "baba_yaga.h"
-
-/* ============================================================================
- * Token Types
- * ============================================================================ */
-
-typedef enum {
-    /* End of file */
-    TOKEN_EOF,
-
-    /* Literals */
-    TOKEN_NUMBER,
-    TOKEN_STRING,
-    TOKEN_BOOLEAN,
-
-    /* Identifiers and keywords */
-    TOKEN_IDENTIFIER,
-    TOKEN_KEYWORD_WHEN,
-    TOKEN_KEYWORD_IS,
-    TOKEN_KEYWORD_THEN,
-    TOKEN_KEYWORD_AND,
-    TOKEN_KEYWORD_OR,
-    TOKEN_KEYWORD_XOR,
-    TOKEN_KEYWORD_NOT,
-    TOKEN_KEYWORD_VIA,
-
-    /* Operators */
-    TOKEN_OP_PLUS,
-    TOKEN_OP_MINUS,
-    TOKEN_OP_UNARY_MINUS,
-    TOKEN_OP_MULTIPLY,
-    TOKEN_OP_DIVIDE,
-    TOKEN_OP_MODULO,
-    TOKEN_OP_POWER,
-    TOKEN_OP_EQUALS,
-    TOKEN_OP_NOT_EQUALS,
-    TOKEN_OP_LESS,
-    TOKEN_OP_LESS_EQUAL,
-    TOKEN_OP_GREATER,
-    TOKEN_OP_GREATER_EQUAL,
-
-    /* Punctuation */
-    TOKEN_LPAREN,
-    TOKEN_RPAREN,
-    TOKEN_LBRACE,
-    TOKEN_RBRACE,
-    TOKEN_LBRACKET,
-    TOKEN_RBRACKET,
-    TOKEN_COMMA,
-    TOKEN_COLON,
-    TOKEN_SEMICOLON,
-    TOKEN_ARROW,
-    TOKEN_DOT,
-
-    /* Special tokens */
-    TOKEN_FUNCTION_REF, /* @function */
-    TOKEN_IO_IN,        /* ..in */
-    TOKEN_IO_OUT,       /* ..out */
-    TOKEN_IO_ASSERT,    /* ..assert */
-    TOKEN_IO_EMIT,      /* ..emit */
-    TOKEN_IO_LISTEN     /* ..listen */
-} TokenType;
-
-/* ============================================================================
- * Token Structure
- * ============================================================================ */
-
-typedef struct {
-    TokenType type;
-    char* lexeme;
-    int line;
-    int column;
-    union {
-        double number;
-        bool boolean;
-    } literal;
-} Token;
-
-/* ============================================================================
- * Lexer Structure
- * ============================================================================ */
-
-typedef struct {
-    const char* source;
-    size_t source_len;
-    size_t position;
-    int line;
-    int column;
-    Token current_token;
-    bool has_error;
-    char* error_message;
-} Lexer;
-
-/* ============================================================================
- * Token Helper Functions
- * ============================================================================ */
-
-/**
- * @brief Create a simple token
- *
- * @param type Token type
- * @param lexeme Token lexeme
- * @param line Line number
- * @param column Column number
- * @return New token
- */
-static Token token_create(TokenType type, const char* lexeme, int line, int column) {
-    Token token;
-    token.type = type;
-    token.lexeme = lexeme != NULL ? strdup(lexeme) : NULL;
-    token.line = line;
-    token.column = column;
-    token.literal.number = 0.0; /* Initialize union */
-    return token;
-}
-
-/* ============================================================================
- * Lexer Functions
- * ============================================================================ */
-
-/**
- * @brief Create a new lexer
- *
- * @param source Source code to tokenize
- * @param source_len Length of source code
- * @return New lexer instance, or NULL on failure
- */
-static Lexer* lexer_create(const char* source, size_t source_len) {
-    Lexer* lexer = malloc(sizeof(Lexer));
-    if (lexer == NULL) {
-        return NULL;
-    }
-
-    lexer->source = source;
-    lexer->source_len = source_len;
-    lexer->position = 0;
-    lexer->line = 1;
-    lexer->column = 1;
-    lexer->has_error = false;
-    lexer->error_message = NULL;
-
-    /* Initialize current token */
-    lexer->current_token.type = TOKEN_EOF;
-    lexer->current_token.lexeme = NULL;
-    lexer->current_token.line = 1;
-    lexer->current_token.column = 1;
-
-    return lexer;
-}
-
-/**
- * @brief Destroy a lexer
- *
- * @param lexer Lexer to destroy
- */
-static void lexer_destroy(Lexer* lexer) {
-    if (lexer == NULL) {
-        return;
-    }
-
-    if (lexer->current_token.lexeme != NULL) {
-        free(lexer->current_token.lexeme);
-    }
-
-    if (lexer->error_message != NULL) {
-        free(lexer->error_message);
-    }
-
-    free(lexer);
-}
-
-/**
- * @brief Set lexer error
- *
- * @param lexer Lexer instance
- * @param message Error message
- */
-static void lexer_set_error(Lexer* lexer, const char* message) {
-    if (lexer == NULL) {
-        return;
-    }
-
-    lexer->has_error = true;
-    if (lexer->error_message != NULL) {
-        free(lexer->error_message);
-    }
-    lexer->error_message = strdup(message);
-}
-
-/**
- * @brief Check if we're at the end of input
- *
- * @param lexer Lexer instance
- * @return true if at end, false otherwise
- */
-static bool lexer_is_at_end(const Lexer* lexer) {
-    return lexer->position >= lexer->source_len;
-}
-
-/**
- * @brief Peek at current character
- *
- * @param lexer Lexer instance
- * @return Current character, or '\0' if at end
- */
-static char lexer_peek(const Lexer* lexer) {
-    if (lexer_is_at_end(lexer)) {
-        return '\0';
-    }
-    return lexer->source[lexer->position];
-}
-
-/**
- * @brief Peek at next character
- *
- * @param lexer Lexer instance
- * @return Next character, or '\0' if at end
- */
-static char lexer_peek_next(const Lexer* lexer) {
-    if (lexer->position + 1 >= lexer->source_len) {
-        return '\0';
-    }
-    return lexer->source[lexer->position + 1];
-}
-
-/**
- * @brief Advance to next character
- *
- * @param lexer Lexer instance
- * @return Character that was advanced over
- */
-static char lexer_advance(Lexer* lexer) {
-    if (lexer_is_at_end(lexer)) {
-        return '\0';
-    }
-
-    char c = lexer->source[lexer->position];
-    lexer->position++;
-    lexer->column++;
-
-    if (c == '\n') {
-        lexer->line++;
-        lexer->column = 1;
-    }
-
-    return c;
-}
-
-/**
- * @brief Match current character and advance if it matches
- *
- * @param lexer Lexer instance
- * @param expected Expected character
- * @return true if matched, false otherwise
- */
-static bool lexer_match(Lexer* lexer, char expected) {
-    if (lexer_is_at_end(lexer)) {
-        return false;
-    }
-
-    if (lexer->source[lexer->position] != expected) {
-        return false;
-    }
-
-    lexer_advance(lexer);
-    return true;
-}
-
-/**
- * @brief Skip whitespace
- *
- * @param lexer Lexer instance
- */
-static void lexer_skip_whitespace(Lexer* lexer) {
-    /* Cast to unsigned char: passing a negative char to isspace() is UB */
-    while (!lexer_is_at_end(lexer) && isspace((unsigned char)lexer_peek(lexer))) {
-        lexer_advance(lexer);
-    }
-}
-
-/**
- * @brief Skip comments
- *
- * @param lexer Lexer instance
- */
-static void lexer_skip_comments(Lexer* lexer) {
-    if (lexer_peek(lexer) == '/' && lexer_peek_next(lexer) == '/') {
-        /* Single-line comment */
-        while (!lexer_is_at_end(lexer) && lexer_peek(lexer) != '\n') {
-            lexer_advance(lexer);
-        }
-    } else if (lexer_peek(lexer) == '/' && lexer_peek_next(lexer) == '*') {
-        /* Multi-line comment */
-        lexer_advance(lexer); /* consume '/' */
-        lexer_advance(lexer); /* consume '*' */
-
-        while (!lexer_is_at_end(lexer)) {
-            if (lexer_peek(lexer) == '*' && lexer_peek_next(lexer) == '/') {
-                lexer_advance(lexer); /* consume '*' */
-                lexer_advance(lexer); /* consume '/' */
-                break;
-            }
-            lexer_advance(lexer);
-        }
-    }
-}
-
-/**
- * @brief Read a number literal
- *
- * @param lexer Lexer instance
- * @return Token with number literal
- */
-static Token lexer_read_number(Lexer* lexer) {
-    Token token;
-    token.type = TOKEN_NUMBER;
-    token.line = lexer->line;
-    token.column = lexer->column;
-
-    /* Record where the literal starts; numbers never span lines */
-    size_t start = lexer->position;
-
-    /* Read integer part */
-    while (!lexer_is_at_end(lexer) && isdigit((unsigned char)lexer_peek(lexer))) {
-        lexer_advance(lexer);
-    }
-
-    /* Read decimal part */
-    if (!lexer_is_at_end(lexer) && lexer_peek(lexer) == '.' &&
-        isdigit((unsigned char)lexer_peek_next(lexer))) {
-        lexer_advance(lexer); /* consume '.' */
-
-        while (!lexer_is_at_end(lexer) && isdigit((unsigned char)lexer_peek(lexer))) {
-            lexer_advance(lexer);
-        }
-    }
-
-    /* Read exponent part */
-    if (!lexer_is_at_end(lexer) && (lexer_peek(lexer) == 'e' || lexer_peek(lexer) == 'E')) {
-        lexer_advance(lexer); /* consume 'e' or 'E' */
-
-        if (!lexer_is_at_end(lexer) && (lexer_peek(lexer) == '+' || lexer_peek(lexer) == '-')) {
-            lexer_advance(lexer); /* consume sign */
-        }
-
-        while (!lexer_is_at_end(lexer) && isdigit((unsigned char)lexer_peek(lexer))) {
-            lexer_advance(lexer);
-        }
-    }
-
-    /* Extract lexeme and convert to number */
-    size_t length = lexer->position - start;
-
-    token.lexeme = malloc(length + 1);
-    if (token.lexeme == NULL) {
-        lexer_set_error(lexer, "Memory allocation failed");
-        token.type = TOKEN_EOF;
-        return token;
-    }
-
-    strncpy(token.lexeme, lexer->source + start, length);
-    token.lexeme[length] = '\0';
-
-    token.literal.number = atof(token.lexeme);
-
-    return token;
-}
-
-/**
- * @brief Read a string literal
- *
- * @param lexer Lexer instance
- * @return Token with string literal
- */
-static Token lexer_read_string(Lexer* lexer) {
-    Token token;
-    token.type = TOKEN_STRING;
-    token.line = lexer->line;
-    token.column = lexer->column;
-
-    lexer_advance(lexer); /* consume opening quote */
-
-    size_t start = lexer->position;
-
-    while (!lexer_is_at_end(lexer) && lexer_peek(lexer) != '"') {
-        if (lexer_peek(lexer) == '\\') {
-            lexer_advance(lexer); /* consume backslash */
-            if (!lexer_is_at_end(lexer)) {
-                lexer_advance(lexer); /* consume escaped character */
-            }
-        } else {
-            lexer_advance(lexer);
-        }
-    }
-
-    if (lexer_is_at_end(lexer)) {
-        lexer_set_error(lexer, "Unterminated string literal");
-        token.type = TOKEN_EOF;
-        token.lexeme = NULL;
-        return token;
-    }
-
-    /* Length of the raw body; escape sequences are kept verbatim, so an
-     * escaped character counts as two bytes here */
-    size_t length = lexer->position - start;
-
-    lexer_advance(lexer); /* consume closing quote */
-
-    /* Extract lexeme */
-    token.lexeme = malloc(length + 1);
-    if (token.lexeme == NULL) {
-        lexer_set_error(lexer, "Memory allocation failed");
-        token.type = TOKEN_EOF;
-        return token;
-    }
-
-    strncpy(token.lexeme, lexer->source + start, length);
-    token.lexeme[length] = '\0';
-
-    return token;
-}
-
-/**
- * @brief Read an identifier or keyword
- *
- * @param lexer Lexer instance
- * @return Token with identifier or keyword
- */
-static Token lexer_read_identifier(Lexer* lexer) {
-    Token token;
-    token.line = lexer->line;
-    token.column = lexer->column;
-
-    size_t start = lexer->position;
-    size_t length = 0;
-
-    while (!lexer_is_at_end(lexer) &&
-           (isalnum((unsigned char)lexer_peek(lexer)) || lexer_peek(lexer) == '_')) {
-        lexer_advance(lexer);
-        length++;
-    }
-
-    /* Extract lexeme */
-    token.lexeme = malloc(length + 1);
-    if (token.lexeme == NULL) {
-        lexer_set_error(lexer, "Memory allocation failed");
-        token.type = TOKEN_EOF;
-        return token;
-    }
-
-    strncpy(token.lexeme, lexer->source + start, length);
-    token.lexeme[length] = '\0';
-
-    /* Check if it's a keyword; and/or/xor map to the keyword tokens
-     * declared in TokenType so they are never mis-lexed as identifiers */
-    if (strcmp(token.lexeme, "when") == 0) {
-        token.type = TOKEN_KEYWORD_WHEN;
-    } else if (strcmp(token.lexeme, "is") == 0) {
-        token.type = TOKEN_KEYWORD_IS;
-    } else if (strcmp(token.lexeme, "then") == 0) {
-        token.type = TOKEN_KEYWORD_THEN;
-    } else if (strcmp(token.lexeme, "and") == 0) {
-        token.type = TOKEN_KEYWORD_AND;
-    } else if (strcmp(token.lexeme, "or") == 0) {
-        token.type = TOKEN_KEYWORD_OR;
-    } else if (strcmp(token.lexeme, "xor") == 0) {
-        token.type = TOKEN_KEYWORD_XOR;
-    } else if (strcmp(token.lexeme, "not") == 0) {
-        token.type = TOKEN_KEYWORD_NOT;
-    } else if (strcmp(token.lexeme, "via") == 0) {
-        token.type = TOKEN_KEYWORD_VIA;
-    } else if (strcmp(token.lexeme, "true") == 0) {
-        token.type = TOKEN_BOOLEAN;
-        token.literal.boolean = true;
-    } else if (strcmp(token.lexeme, "false") == 0) {
-        token.type = TOKEN_BOOLEAN;
-        token.literal.boolean = false;
-    } else {
-        token.type = TOKEN_IDENTIFIER;
-    }
-
-    return token;
-}
-
-/**
- * @brief Read a special token (function reference, IO operations)
- *
- * @param lexer Lexer instance
- * @return Token with special type
- */
-static Token lexer_read_special(Lexer* lexer) {
-    Token token;
-    token.line = lexer->line;
-    token.column = lexer->column;
-
-    if (lexer_peek(lexer) == '@') {
-        /* Function reference */
-        lexer_advance(lexer); /* consume '@' */
-
-        /* Check if this is @(expression) syntax */
-        if (!lexer_is_at_end(lexer) && lexer_peek(lexer) == '(') {
-            /* Just return the @ token for @(expression) syntax */
-            token.type = TOKEN_FUNCTION_REF;
-            token.lexeme = malloc(2); /* room for '@' and '\0' */
-            if (token.lexeme == NULL) {
-                lexer_set_error(lexer, "Memory allocation failed");
-                token.type = TOKEN_EOF;
-                return token;
-            }
-            token.lexeme[0] = '@';
-            token.lexeme[1] = '\0';
-        } else {
-            /* Handle @function_name syntax */
-            size_t start = lexer->position;
-            size_t length = 0;
-
-            while (!lexer_is_at_end(lexer) &&
-                   (isalnum((unsigned char)lexer_peek(lexer)) || lexer_peek(lexer) == '_')) {
-                lexer_advance(lexer);
-                length++;
-            }
-
-            if (length == 0) {
-                lexer_set_error(lexer, "Invalid function reference");
-                token.type = TOKEN_EOF;
-                token.lexeme = NULL;
-                return token;
-            }
-
-            token.type = TOKEN_FUNCTION_REF;
-            token.lexeme = malloc(length + 2); /* +2 for '@' and '\0' */
-            if (token.lexeme == NULL) {
-                lexer_set_error(lexer, "Memory allocation failed");
-                token.type = TOKEN_EOF;
-                return token;
-            }
-
-            token.lexeme[0] = '@';
-            strncpy(token.lexeme + 1, lexer->source + start, length);
-            token.lexeme[length + 1] = '\0';
-        }
-
-    } else if (lexer_peek(lexer) == '.' && lexer_peek_next(lexer) == '.') {
-        /* IO operation */
-        lexer_advance(lexer); /* consume first '.' */
-        lexer_advance(lexer); /* consume second '.' */
-
-        size_t start = lexer->position;
-        size_t length = 0;
-
-        while (!lexer_is_at_end(lexer) &&
-               (isalpha((unsigned char)lexer_peek(lexer)) || lexer_peek(lexer) == '_')) {
-            lexer_advance(lexer);
-            length++;
-        }
-
-        if (length == 0) {
-            lexer_set_error(lexer, "Invalid IO operation");
-            token.type = TOKEN_EOF;
-            token.lexeme = NULL;
-            return token;
-        }
-
-        token.lexeme = malloc(length + 3); /* +3 for the two dots and '\0' */
-        if (token.lexeme == NULL) {
-            lexer_set_error(lexer, "Memory allocation failed");
-            token.type = TOKEN_EOF;
-            return token;
-        }
-
-        token.lexeme[0] = '.';
-        token.lexeme[1] = '.';
-        strncpy(token.lexeme + 2, lexer->source + start, length);
-        token.lexeme[length + 2] = '\0';
-
-        /* Determine IO operation type */
-        if (strcmp(token.lexeme, "..in") == 0) {
-            token.type = TOKEN_IO_IN;
-        } else if (strcmp(token.lexeme, "..out") == 0) {
-            token.type = TOKEN_IO_OUT;
-        } else if (strcmp(token.lexeme, "..assert") == 0) {
-            token.type = TOKEN_IO_ASSERT;
-        } else if (strcmp(token.lexeme, "..emit") == 0) {
-            token.type = TOKEN_IO_EMIT;
-        } else if (strcmp(token.lexeme, "..listen") == 0) {
-            token.type = TOKEN_IO_LISTEN;
-        } else {
-            lexer_set_error(lexer, "Unknown IO operation");
-            token.type = TOKEN_EOF;
-            free(token.lexeme);
-            token.lexeme = NULL; /* never return a dangling pointer */
-            return token;
-        }
-    } else {
-        /* Callers only dispatch here on '@' or '..'; fail loudly otherwise
-         * instead of returning an uninitialized token */
-        lexer_set_error(lexer, "Invalid special token");
-        token.type = TOKEN_EOF;
-        token.lexeme = NULL;
-    }
-
-    return token;
-}
-
-/**
- * @brief Read the next token
- *
- * @param lexer Lexer instance
- * @return Next token
- */
-static Token lexer_next_token(Lexer* lexer) {
-    /* Skip whitespace and comments, repeating until neither applies so
-     * that back-to-back comments cannot leak into the token stream */
-    for (;;) {
-        lexer_skip_whitespace(lexer);
-        if (lexer_peek(lexer) == '/' &&
-            (lexer_peek_next(lexer) == '/' || lexer_peek_next(lexer) == '*')) {
-            lexer_skip_comments(lexer);
-            continue;
-        }
-        break;
-    }
-
-    if (lexer_is_at_end(lexer)) {
-        Token token;
-        token.type = TOKEN_EOF;
-        token.lexeme = NULL;
-        token.line = lexer->line;
-        token.column = lexer->column;
-        return token;
-    }
-
-    char c = lexer_peek(lexer);
-
-    /* Numbers */
-    if (isdigit((unsigned char)c)) {
-        return lexer_read_number(lexer);
-    }
-
-    /* Strings */
-    if (c == '"') {
-        return lexer_read_string(lexer);
-    }
-
-    /* Special tokens */
-    if (c == '@' || (c == '.' && lexer_peek_next(lexer) == '.')) {
-        return lexer_read_special(lexer);
-    }
-
-    /* Identifiers and keywords */
-    if (isalpha((unsigned char)c) || c == '_') {
-        return lexer_read_identifier(lexer);
-    }
-
-    /* Single character tokens */
-    switch (c) {
-        case '(':
-            lexer_advance(lexer);
-            return token_create(TOKEN_LPAREN, "(", lexer->line, lexer->column - 1);
-        case ')':
-            lexer_advance(lexer);
-            return token_create(TOKEN_RPAREN, ")", lexer->line, lexer->column - 1);
-        case '{':
-            lexer_advance(lexer);
-            return token_create(TOKEN_LBRACE, "{", lexer->line, lexer->column - 1);
-        case '}':
-            lexer_advance(lexer);
-            return token_create(TOKEN_RBRACE, "}", lexer->line, lexer->column - 1);
-        case '[':
-            lexer_advance(lexer);
-            return token_create(TOKEN_LBRACKET, "[", lexer->line, lexer->column - 1);
-        case ']':
-            lexer_advance(lexer);
-            return token_create(TOKEN_RBRACKET, "]", lexer->line, lexer->column - 1);
-        case ',':
-            lexer_advance(lexer);
-            return token_create(TOKEN_COMMA, ",", lexer->line, lexer->column - 1);
-        case ':':
-            lexer_advance(lexer);
-            return token_create(TOKEN_COLON, ":", lexer->line, lexer->column - 1);
-        case ';':
-            lexer_advance(lexer);
-            return token_create(TOKEN_SEMICOLON, ";", lexer->line, lexer->column - 1);
-        case '.':
-            lexer_advance(lexer);
-            return token_create(TOKEN_DOT, ".", lexer->line, lexer->column - 1);
-        case '-':
-            lexer_advance(lexer);
-            if (lexer_match(lexer, '>')) {
-                return token_create(TOKEN_ARROW, "->", lexer->line, lexer->column - 2);
-            }
-
-            /* Heuristic: a '-' directly followed by a digit, identifier, or
-             * '(' is lexed as unary minus. `a-b` without spaces also takes
-             * this path, so the parser must disambiguate from context. */
-            if (isalnum((unsigned char)lexer_peek(lexer)) ||
-                lexer_peek(lexer) == '_' ||
-                lexer_peek(lexer) == '(') {
-                return token_create(TOKEN_OP_UNARY_MINUS, "-", lexer->line, lexer->column - 1);
-            }
-            /* Otherwise treat as binary minus */
-            return token_create(TOKEN_OP_MINUS, "-", lexer->line, lexer->column - 1);
-        case '+':
-            lexer_advance(lexer);
-            return token_create(TOKEN_OP_PLUS, "+", lexer->line, lexer->column - 1);
-        case '*':
-            lexer_advance(lexer);
-            return token_create(TOKEN_OP_MULTIPLY, "*", lexer->line, lexer->column - 1);
-        case '/':
-            lexer_advance(lexer);
-            return token_create(TOKEN_OP_DIVIDE, "/", lexer->line, lexer->column - 1);
-        case '%':
-            lexer_advance(lexer);
-            return token_create(TOKEN_OP_MODULO, "%", lexer->line, lexer->column - 1);
-        case '^':
-            lexer_advance(lexer);
-            return token_create(TOKEN_OP_POWER, "^", lexer->line, lexer->column - 1);
-        case '=':
-            lexer_advance(lexer);
-            /* A lone '=' is also treated as equality */
-            if (lexer_match(lexer, '=')) {
-                return token_create(TOKEN_OP_EQUALS, "==", lexer->line, lexer->column - 2);
-            }
-            return token_create(TOKEN_OP_EQUALS, "=", lexer->line, lexer->column - 1);
-        case '!':
-            lexer_advance(lexer);
-            if (lexer_match(lexer, '=')) {
-                return token_create(TOKEN_OP_NOT_EQUALS, "!=", lexer->line, lexer->column - 2);
-            }
-            break; /* a bare '!' falls through to the error below */
-        case '<':
-            lexer_advance(lexer);
-            if (lexer_match(lexer, '=')) {
-                return token_create(TOKEN_OP_LESS_EQUAL, "<=", lexer->line, lexer->column - 2);
-            }
-            return token_create(TOKEN_OP_LESS, "<", lexer->line, lexer->column - 1);
-        case '>':
-            lexer_advance(lexer);
-            if (lexer_match(lexer, '=')) {
-                return token_create(TOKEN_OP_GREATER_EQUAL, ">=", lexer->line, lexer->column - 2);
-            }
-            return token_create(TOKEN_OP_GREATER, ">", lexer->line, lexer->column - 1);
-    }
-
-    /* Unknown character */
-    char error_msg[64];
-    snprintf(error_msg, sizeof(error_msg), "Unexpected character: '%c'", c);
-    lexer_set_error(lexer, error_msg);
-
-    Token token;
-    token.type = TOKEN_EOF;
-    token.lexeme = NULL;
-    token.line = lexer->line;
-    token.column = lexer->column;
-    return token;
-}
-
-/* ============================================================================
- * Public Lexer API
- * ============================================================================ */
-
-/**
- * @brief Tokenize source code
- *
- * @param source Source code to tokenize
- * @param source_len Length of source code
- * @param tokens Output array for tokens
- * @param max_tokens Maximum number of tokens to read
- * @return Number of tokens read, or -1 on error
- */
-int baba_yaga_tokenize(const char* source, size_t source_len,
-                       void** tokens, size_t max_tokens) {
-    if (source == NULL || tokens == NULL) {
-        return -1;
-    }
-
-    Lexer* lexer = lexer_create(source, source_len);
-    if (lexer == NULL) {
-        return -1;
-    }
-
-    size_t token_count = 0;
-
-    while (token_count < max_tokens) {
-        Token token = lexer_next_token(lexer);
-
-        if (lexer->has_error) {
-            /* Release tokens already produced; the caller only sees -1
-             * and cannot free them itself */
-            baba_yaga_free_tokens(tokens, token_count);
-            lexer_destroy(lexer);
-            return -1;
-        }
-
-        if (token.type == TOKEN_EOF) {
-            break;
-        }
-
-        /* Allocate token and copy data */
-        Token* token_ptr = malloc(sizeof(Token));
-        if (token_ptr == NULL) {
-            if (token.lexeme != NULL) {
-                free(token.lexeme);
-            }
-            baba_yaga_free_tokens(tokens, token_count);
-            lexer_destroy(lexer);
-            return -1;
-        }
-
-        *token_ptr = token;
-        tokens[token_count] = token_ptr;
-        token_count++;
-    }
-
-    lexer_destroy(lexer);
-    return (int)token_count;
-}
-
-/**
- * @brief Free tokens
- *
- * @param tokens Array of tokens
- * @param count Number of tokens
- */
-void baba_yaga_free_tokens(void** tokens, size_t count) {
-    if (tokens == NULL) {
-        return;
-    }
-
-    for (size_t i = 0; i < count; i++) {
-        if (tokens[i] != NULL) {
-            Token* token = (Token*)tokens[i];
-            if (token->lexeme != NULL) {
-                free(token->lexeme);
-            }
-            free(token);
-        }
-    }
-}
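
For reference, a minimal driver sketch for the public API above. It assumes baba_yaga.h declares baba_yaga_tokenize() and baba_yaga_free_tokens() with the signatures shown in this file; since the Token struct is private to lexer.c, callers only ever hold opaque void* handles, so the example can count tokens but not inspect them. The sample source string is hypothetical.

/* usage_sketch.c: tokenize a snippet, report the count, free the tokens */
#include <stdio.h>
#include <string.h>

#include "baba_yaga.h"

int main(void) {
    const char* source = "when x is 1 then \"one\"";
    void* tokens[64]; /* caller-provided array; 64 is an arbitrary cap */

    int count = baba_yaga_tokenize(source, strlen(source), tokens, 64);
    if (count < 0) {
        /* -1 covers lex errors and allocation failure alike; on this
         * path baba_yaga_tokenize has already freed any partial output */
        fprintf(stderr, "tokenization failed\n");
        return 1;
    }

    printf("read %d tokens\n", count);
    baba_yaga_free_tokens(tokens, (size_t)count);
    return 0;
}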