Diffstat (limited to 'js/scripting-lang/baba-yaga-c/src/lexer.c')
-rw-r--r-- | js/scripting-lang/baba-yaga-c/src/lexer.c | 820
1 file changed, 820 insertions, 0 deletions
diff --git a/js/scripting-lang/baba-yaga-c/src/lexer.c b/js/scripting-lang/baba-yaga-c/src/lexer.c
new file mode 100644
index 0000000..cc15047
--- /dev/null
+++ b/js/scripting-lang/baba-yaga-c/src/lexer.c
@@ -0,0 +1,820 @@
/**
 * @file lexer.c
 * @brief Lexer implementation for Baba Yaga
 * @author eli_oat
 * @version 0.0.1
 * @date 2025
 *
 * This file implements the lexical analyzer for the Baba Yaga language.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

#include "baba_yaga.h"

/* ============================================================================
 * Token Types
 * ============================================================================ */

typedef enum {
    /* End of file */
    TOKEN_EOF,

    /* Literals */
    TOKEN_NUMBER,
    TOKEN_STRING,
    TOKEN_BOOLEAN,

    /* Identifiers and keywords */
    TOKEN_IDENTIFIER,
    TOKEN_KEYWORD_WHEN,
    TOKEN_KEYWORD_IS,
    TOKEN_KEYWORD_THEN,
    TOKEN_KEYWORD_AND,
    TOKEN_KEYWORD_OR,
    TOKEN_KEYWORD_XOR,
    TOKEN_KEYWORD_NOT,
    TOKEN_KEYWORD_VIA,

    /* Operators */
    TOKEN_OP_PLUS,
    TOKEN_OP_MINUS,
    TOKEN_OP_UNARY_MINUS,
    TOKEN_OP_MULTIPLY,
    TOKEN_OP_DIVIDE,
    TOKEN_OP_MODULO,
    TOKEN_OP_POWER,
    TOKEN_OP_EQUALS,
    TOKEN_OP_NOT_EQUALS,
    TOKEN_OP_LESS,
    TOKEN_OP_LESS_EQUAL,
    TOKEN_OP_GREATER,
    TOKEN_OP_GREATER_EQUAL,

    /* Punctuation */
    TOKEN_LPAREN,
    TOKEN_RPAREN,
    TOKEN_LBRACE,
    TOKEN_RBRACE,
    TOKEN_LBRACKET,
    TOKEN_RBRACKET,
    TOKEN_COMMA,
    TOKEN_COLON,
    TOKEN_SEMICOLON,
    TOKEN_ARROW,
    TOKEN_DOT,

    /* Special tokens */
    TOKEN_FUNCTION_REF, /* @function */
    TOKEN_IO_IN,        /* ..in */
    TOKEN_IO_OUT,       /* ..out */
    TOKEN_IO_ASSERT,    /* ..assert */

    /* Comments */
    TOKEN_COMMENT
} TokenType;

/* ============================================================================
 * Token Structure
 * ============================================================================ */

typedef struct {
    TokenType type;
    char* lexeme;
    int line;
    int column;
    union {
        double number;
        bool boolean;
    } literal;
} Token;

/* ============================================================================
 * Lexer Structure
 * ============================================================================ */

typedef struct {
    const char* source;
    size_t source_len;
    size_t position;
    int line;
    int column;
    Token current_token;
    bool has_error;
    char* error_message;
} Lexer;
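/*
 * Illustration only (not taken from a grammar spec): given the lexing rules
 * below, a hypothetical input such as
 *
 *     result : when x is 0 then "zero"
 *
 * would produce the stream IDENTIFIER("result") COLON KEYWORD_WHEN
 * IDENTIFIER("x") KEYWORD_IS NUMBER(0) KEYWORD_THEN STRING("zero") EOF.
 */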
/* ============================================================================
 * Token Helper Functions
 * ============================================================================ */

/**
 * @brief Create a simple token
 *
 * @param type Token type
 * @param lexeme Token lexeme
 * @param line Line number
 * @param column Column number
 * @return New token
 */
static Token token_create(TokenType type, const char* lexeme, int line, int column) {
    Token token;
    token.type = type;
    token.lexeme = lexeme != NULL ? strdup(lexeme) : NULL;
    token.line = line;
    token.column = column;
    token.literal.number = 0.0; /* Initialize union */
    return token;
}

/* ============================================================================
 * Lexer Functions
 * ============================================================================ */

/**
 * @brief Create a new lexer
 *
 * @param source Source code to tokenize
 * @param source_len Length of source code
 * @return New lexer instance, or NULL on failure
 */
static Lexer* lexer_create(const char* source, size_t source_len) {
    Lexer* lexer = malloc(sizeof(Lexer));
    if (lexer == NULL) {
        return NULL;
    }

    lexer->source = source;
    lexer->source_len = source_len;
    lexer->position = 0;
    lexer->line = 1;
    lexer->column = 1;
    lexer->has_error = false;
    lexer->error_message = NULL;

    /* Initialize current token */
    lexer->current_token.type = TOKEN_EOF;
    lexer->current_token.lexeme = NULL;
    lexer->current_token.line = 1;
    lexer->current_token.column = 1;

    return lexer;
}

/**
 * @brief Destroy a lexer
 *
 * @param lexer Lexer to destroy
 */
static void lexer_destroy(Lexer* lexer) {
    if (lexer == NULL) {
        return;
    }

    if (lexer->current_token.lexeme != NULL) {
        free(lexer->current_token.lexeme);
    }

    if (lexer->error_message != NULL) {
        free(lexer->error_message);
    }

    free(lexer);
}

/**
 * @brief Set lexer error
 *
 * @param lexer Lexer instance
 * @param message Error message
 */
static void lexer_set_error(Lexer* lexer, const char* message) {
    if (lexer == NULL) {
        return;
    }

    lexer->has_error = true;
    if (lexer->error_message != NULL) {
        free(lexer->error_message);
    }
    lexer->error_message = strdup(message);
}

/**
 * @brief Check if we're at the end of input
 *
 * @param lexer Lexer instance
 * @return true if at end, false otherwise
 */
static bool lexer_is_at_end(const Lexer* lexer) {
    return lexer->position >= lexer->source_len;
}

/**
 * @brief Peek at current character
 *
 * @param lexer Lexer instance
 * @return Current character, or '\0' if at end
 */
static char lexer_peek(const Lexer* lexer) {
    if (lexer_is_at_end(lexer)) {
        return '\0';
    }
    return lexer->source[lexer->position];
}

/**
 * @brief Peek at next character
 *
 * @param lexer Lexer instance
 * @return Next character, or '\0' if at end
 */
static char lexer_peek_next(const Lexer* lexer) {
    if (lexer->position + 1 >= lexer->source_len) {
        return '\0';
    }
    return lexer->source[lexer->position + 1];
}

/**
 * @brief Advance to next character
 *
 * @param lexer Lexer instance
 * @return Character that was advanced over
 */
static char lexer_advance(Lexer* lexer) {
    if (lexer_is_at_end(lexer)) {
        return '\0';
    }

    char c = lexer->source[lexer->position];
    lexer->position++;
    lexer->column++;

    if (c == '\n') {
        lexer->line++;
        lexer->column = 1;
    }

    return c;
}

/**
 * @brief Match current character and advance if it matches
 *
 * @param lexer Lexer instance
 * @param expected Expected character
 * @return true if matched, false otherwise
 */
static bool lexer_match(Lexer* lexer, char expected) {
    if (lexer_is_at_end(lexer)) {
        return false;
    }

    if (lexer->source[lexer->position] != expected) {
        return false;
    }

    lexer_advance(lexer);
    return true;
}

/**
 * @brief Skip whitespace
 *
 * @param lexer Lexer instance
 */
static void lexer_skip_whitespace(Lexer* lexer) {
    /* Cast to unsigned char: passing a negative char to isspace() is UB */
    while (!lexer_is_at_end(lexer) && isspace((unsigned char)lexer_peek(lexer))) {
        lexer_advance(lexer);
    }
}
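/*
 * A small worked example of the cursor helpers above (illustrative, not from
 * the original sources): lexing the two-line input "ab\ncd" advances as
 *
 *     advance -> 'a'   (line 1, column 2 afterwards)
 *     advance -> 'b'   (line 1, column 3)
 *     advance -> '\n'  (line 2, column 1: the newline resets the column)
 *     advance -> 'c'   (line 2, column 2)
 *
 * so every token records the 1-based line/column of its first character.
 */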
/**
 * @brief Skip comments
 *
 * @param lexer Lexer instance
 */
static void lexer_skip_comments(Lexer* lexer) {
    if (lexer_peek(lexer) == '/' && lexer_peek_next(lexer) == '/') {
        /* Single line comment */
        while (!lexer_is_at_end(lexer) && lexer_peek(lexer) != '\n') {
            lexer_advance(lexer);
        }
    } else if (lexer_peek(lexer) == '/' && lexer_peek_next(lexer) == '*') {
        /* Multi-line comment */
        lexer_advance(lexer); /* consume '/' */
        lexer_advance(lexer); /* consume '*' */

        while (!lexer_is_at_end(lexer)) {
            if (lexer_peek(lexer) == '*' && lexer_peek_next(lexer) == '/') {
                lexer_advance(lexer); /* consume '*' */
                lexer_advance(lexer); /* consume '/' */
                return;
            }
            lexer_advance(lexer);
        }

        /* Reaching EOF here means the comment was never closed */
        lexer_set_error(lexer, "Unterminated block comment");
    }
}

/**
 * @brief Read a number literal
 *
 * @param lexer Lexer instance
 * @return Token with number literal
 */
static Token lexer_read_number(Lexer* lexer) {
    Token token;
    token.type = TOKEN_NUMBER;
    token.line = lexer->line;
    token.column = lexer->column;

    /* Remember where the literal starts; deriving it back from the column
     * would break as soon as a literal followed a tab or a newline */
    size_t start = lexer->position;

    /* Read integer part */
    while (!lexer_is_at_end(lexer) && isdigit((unsigned char)lexer_peek(lexer))) {
        lexer_advance(lexer);
    }

    /* Read decimal part */
    if (!lexer_is_at_end(lexer) && lexer_peek(lexer) == '.' &&
        isdigit((unsigned char)lexer_peek_next(lexer))) {
        lexer_advance(lexer); /* consume '.' */

        while (!lexer_is_at_end(lexer) && isdigit((unsigned char)lexer_peek(lexer))) {
            lexer_advance(lexer);
        }
    }

    /* Read exponent part */
    if (!lexer_is_at_end(lexer) && (lexer_peek(lexer) == 'e' || lexer_peek(lexer) == 'E')) {
        lexer_advance(lexer); /* consume 'e' or 'E' */

        if (!lexer_is_at_end(lexer) && (lexer_peek(lexer) == '+' || lexer_peek(lexer) == '-')) {
            lexer_advance(lexer); /* consume sign */
        }

        while (!lexer_is_at_end(lexer) && isdigit((unsigned char)lexer_peek(lexer))) {
            lexer_advance(lexer);
        }
    }

    /* Extract lexeme and convert to number */
    size_t length = lexer->position - start;

    token.lexeme = malloc(length + 1);
    if (token.lexeme == NULL) {
        lexer_set_error(lexer, "Memory allocation failed");
        token.type = TOKEN_EOF;
        return token;
    }

    strncpy(token.lexeme, lexer->source + start, length);
    token.lexeme[length] = '\0';

    token.literal.number = atof(token.lexeme);

    return token;
}

/**
 * @brief Read a string literal
 *
 * @param lexer Lexer instance
 * @return Token with string literal
 */
static Token lexer_read_string(Lexer* lexer) {
    Token token;
    token.type = TOKEN_STRING;
    token.line = lexer->line;
    token.column = lexer->column;

    lexer_advance(lexer); /* consume opening quote */

    size_t start = lexer->position;

    while (!lexer_is_at_end(lexer) && lexer_peek(lexer) != '"') {
        if (lexer_peek(lexer) == '\\') {
            lexer_advance(lexer); /* consume backslash */
            if (!lexer_is_at_end(lexer)) {
                lexer_advance(lexer); /* consume escaped character */
            }
        } else {
            lexer_advance(lexer);
        }
    }

    if (lexer_is_at_end(lexer)) {
        lexer_set_error(lexer, "Unterminated string literal");
        token.type = TOKEN_EOF;
        return token;
    }

    /* The lexeme keeps escape sequences verbatim; an escaped character
     * occupies two source bytes, so measure by position rather than by
     * counting loop iterations (iteration counting undercounted escapes
     * and truncated the copy) */
    size_t length = lexer->position - start;

    lexer_advance(lexer); /* consume closing quote */

    token.lexeme = malloc(length + 1);
    if (token.lexeme == NULL) {
        lexer_set_error(lexer, "Memory allocation failed");
        token.type = TOKEN_EOF;
        return token;
    }

    strncpy(token.lexeme, lexer->source + start, length);
    token.lexeme[length] = '\0';

    return token;
}
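/*
 * Examples of the two literal readers above (illustrative): "3.14e-2" lexes
 * as a single NUMBER token whose literal.number is 0.0314, and the source
 * text "say \"hi\"" lexes as one STRING token whose lexeme still contains
 * the backslashes; translating escape sequences is assumed to be a later
 * stage's job.
 */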
/**
 * @brief Read an identifier or keyword
 *
 * @param lexer Lexer instance
 * @return Token with identifier or keyword
 */
static Token lexer_read_identifier(Lexer* lexer) {
    Token token;
    token.line = lexer->line;
    token.column = lexer->column;

    size_t start = lexer->position;
    size_t length = 0;

    while (!lexer_is_at_end(lexer) &&
           (isalnum((unsigned char)lexer_peek(lexer)) || lexer_peek(lexer) == '_')) {
        lexer_advance(lexer);
        length++;
    }

    /* Extract lexeme */
    token.lexeme = malloc(length + 1);
    if (token.lexeme == NULL) {
        lexer_set_error(lexer, "Memory allocation failed");
        token.type = TOKEN_EOF;
        return token;
    }

    strncpy(token.lexeme, lexer->source + start, length);
    token.lexeme[length] = '\0';

    /* Check if it's a keyword */
    if (strcmp(token.lexeme, "when") == 0) {
        token.type = TOKEN_KEYWORD_WHEN;
    } else if (strcmp(token.lexeme, "is") == 0) {
        token.type = TOKEN_KEYWORD_IS;
    } else if (strcmp(token.lexeme, "and") == 0) {
        token.type = TOKEN_KEYWORD_AND;
    } else if (strcmp(token.lexeme, "or") == 0) {
        token.type = TOKEN_KEYWORD_OR;
    } else if (strcmp(token.lexeme, "xor") == 0) {
        token.type = TOKEN_KEYWORD_XOR;
    } else if (strcmp(token.lexeme, "then") == 0) {
        token.type = TOKEN_KEYWORD_THEN;
    } else if (strcmp(token.lexeme, "not") == 0) {
        token.type = TOKEN_KEYWORD_NOT;
    } else if (strcmp(token.lexeme, "via") == 0) {
        token.type = TOKEN_KEYWORD_VIA;
    } else if (strcmp(token.lexeme, "true") == 0) {
        token.type = TOKEN_BOOLEAN;
        token.literal.boolean = true;
    } else if (strcmp(token.lexeme, "false") == 0) {
        token.type = TOKEN_BOOLEAN;
        token.literal.boolean = false;
    } else {
        token.type = TOKEN_IDENTIFIER;
    }

    return token;
}
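/*
 * Keyword recognition only applies to whole identifiers: "whenever" is a
 * single IDENTIFIER, never KEYWORD_WHEN followed by "ever", because the
 * strcmp chain runs after the full identifier has been consumed. A chain of
 * string compares is fine for ten keywords; a lookup table would only be
 * worth it if the keyword set grows.
 */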
/**
 * @brief Read a special token (function reference, IO operations)
 *
 * @param lexer Lexer instance
 * @return Token with special type
 */
static Token lexer_read_special(Lexer* lexer) {
    Token token;
    token.line = lexer->line;
    token.column = lexer->column;

    if (lexer_peek(lexer) == '@') {
        /* Function reference */
        lexer_advance(lexer); /* consume '@' */

        /* Check if this is @(expression) syntax */
        if (!lexer_is_at_end(lexer) && lexer_peek(lexer) == '(') {
            /* Just return the @ token for @(expression) syntax */
            token.type = TOKEN_FUNCTION_REF;
            token.lexeme = malloc(2); /* '@' plus '\0' */
            if (token.lexeme == NULL) {
                lexer_set_error(lexer, "Memory allocation failed");
                token.type = TOKEN_EOF;
                return token;
            }
            token.lexeme[0] = '@';
            token.lexeme[1] = '\0';
        } else {
            /* Handle @function_name syntax */
            size_t start = lexer->position;
            size_t length = 0;

            while (!lexer_is_at_end(lexer) &&
                   (isalnum((unsigned char)lexer_peek(lexer)) || lexer_peek(lexer) == '_')) {
                lexer_advance(lexer);
                length++;
            }

            if (length == 0) {
                lexer_set_error(lexer, "Invalid function reference");
                token.type = TOKEN_EOF;
                return token;
            }

            token.type = TOKEN_FUNCTION_REF;
            token.lexeme = malloc(length + 2); /* +2 for '@' and '\0' */
            if (token.lexeme == NULL) {
                lexer_set_error(lexer, "Memory allocation failed");
                token.type = TOKEN_EOF;
                return token;
            }

            token.lexeme[0] = '@';
            strncpy(token.lexeme + 1, lexer->source + start, length);
            token.lexeme[length + 1] = '\0';
        }

    } else if (lexer_peek(lexer) == '.' && lexer_peek_next(lexer) == '.') {
        /* IO operation */
        lexer_advance(lexer); /* consume first '.' */
        lexer_advance(lexer); /* consume second '.' */

        size_t start = lexer->position;
        size_t length = 0;

        while (!lexer_is_at_end(lexer) &&
               (isalpha((unsigned char)lexer_peek(lexer)) || lexer_peek(lexer) == '_')) {
            lexer_advance(lexer);
            length++;
        }

        if (length == 0) {
            lexer_set_error(lexer, "Invalid IO operation");
            token.type = TOKEN_EOF;
            return token;
        }

        token.lexeme = malloc(length + 3); /* +3 for '..' and '\0' */
        if (token.lexeme == NULL) {
            lexer_set_error(lexer, "Memory allocation failed");
            token.type = TOKEN_EOF;
            return token;
        }

        token.lexeme[0] = '.';
        token.lexeme[1] = '.';
        strncpy(token.lexeme + 2, lexer->source + start, length);
        token.lexeme[length + 2] = '\0';

        /* Determine IO operation type */
        if (strcmp(token.lexeme, "..in") == 0) {
            token.type = TOKEN_IO_IN;
        } else if (strcmp(token.lexeme, "..out") == 0) {
            token.type = TOKEN_IO_OUT;
        } else if (strcmp(token.lexeme, "..assert") == 0) {
            token.type = TOKEN_IO_ASSERT;
        } else {
            lexer_set_error(lexer, "Unknown IO operation");
            token.type = TOKEN_EOF;
            free(token.lexeme);
            return token;
        }
    } else {
        /* Unreachable: callers only dispatch here on '@' or "..", but do not
         * return an uninitialized token if that ever changes */
        lexer_set_error(lexer, "Invalid special token");
        token.type = TOKEN_EOF;
        token.lexeme = NULL;
    }

    return token;
}
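/*
 * Illustrative inputs for lexer_read_special: "@add" yields
 * FUNCTION_REF("@add"); "@(" yields a bare FUNCTION_REF("@") so the parser
 * can consume the parenthesized expression itself; "..out" yields IO_OUT;
 * and an unknown operation such as "..launch" is rejected with
 * "Unknown IO operation".
 */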
/**
 * @brief Read the next token
 *
 * @param lexer Lexer instance
 * @return Next token
 */
static Token lexer_next_token(Lexer* lexer) {
    /* Skip any run of whitespace and comments, in either order; looping on
     * the comment check also handles back-to-back comments */
    for (;;) {
        lexer_skip_whitespace(lexer);
        if (lexer_peek(lexer) == '/' &&
            (lexer_peek_next(lexer) == '/' || lexer_peek_next(lexer) == '*')) {
            lexer_skip_comments(lexer);
            continue;
        }
        break;
    }

    if (lexer_is_at_end(lexer)) {
        Token token;
        token.type = TOKEN_EOF;
        token.lexeme = NULL;
        token.line = lexer->line;
        token.column = lexer->column;
        return token;
    }

    char c = lexer_peek(lexer);

    /* Numbers */
    if (isdigit((unsigned char)c)) {
        return lexer_read_number(lexer);
    }

    /* Strings */
    if (c == '"') {
        return lexer_read_string(lexer);
    }

    /* Special tokens */
    if (c == '@' || (c == '.' && lexer_peek_next(lexer) == '.')) {
        return lexer_read_special(lexer);
    }

    /* Identifiers and keywords */
    if (isalpha((unsigned char)c) || c == '_') {
        return lexer_read_identifier(lexer);
    }

    /* Single character tokens */
    switch (c) {
        case '(':
            lexer_advance(lexer);
            return token_create(TOKEN_LPAREN, "(", lexer->line, lexer->column - 1);
        case ')':
            lexer_advance(lexer);
            return token_create(TOKEN_RPAREN, ")", lexer->line, lexer->column - 1);
        case '{':
            lexer_advance(lexer);
            return token_create(TOKEN_LBRACE, "{", lexer->line, lexer->column - 1);
        case '}':
            lexer_advance(lexer);
            return token_create(TOKEN_RBRACE, "}", lexer->line, lexer->column - 1);
        case '[':
            lexer_advance(lexer);
            return token_create(TOKEN_LBRACKET, "[", lexer->line, lexer->column - 1);
        case ']':
            lexer_advance(lexer);
            return token_create(TOKEN_RBRACKET, "]", lexer->line, lexer->column - 1);
        case ',':
            lexer_advance(lexer);
            return token_create(TOKEN_COMMA, ",", lexer->line, lexer->column - 1);
        case ':':
            lexer_advance(lexer);
            return token_create(TOKEN_COLON, ":", lexer->line, lexer->column - 1);
        case ';':
            lexer_advance(lexer);
            return token_create(TOKEN_SEMICOLON, ";", lexer->line, lexer->column - 1);
        case '.':
            lexer_advance(lexer);
            return token_create(TOKEN_DOT, ".", lexer->line, lexer->column - 1);
        case '-':
            lexer_advance(lexer);
            if (lexer_match(lexer, '>')) {
                return token_create(TOKEN_ARROW, "->", lexer->line, lexer->column - 2);
            }
            /* For now, always treat minus as binary operator */
            /* TODO: Implement proper unary vs binary minus detection */
            return token_create(TOKEN_OP_MINUS, "-", lexer->line, lexer->column - 1);
        case '+':
            lexer_advance(lexer);
            return token_create(TOKEN_OP_PLUS, "+", lexer->line, lexer->column - 1);
        case '*':
            lexer_advance(lexer);
            return token_create(TOKEN_OP_MULTIPLY, "*", lexer->line, lexer->column - 1);
        case '/':
            lexer_advance(lexer);
            return token_create(TOKEN_OP_DIVIDE, "/", lexer->line, lexer->column - 1);
        case '%':
            lexer_advance(lexer);
            return token_create(TOKEN_OP_MODULO, "%", lexer->line, lexer->column - 1);
        case '^':
            lexer_advance(lexer);
            return token_create(TOKEN_OP_POWER, "^", lexer->line, lexer->column - 1);
        case '=':
            lexer_advance(lexer);
            if (lexer_match(lexer, '=')) {
                return token_create(TOKEN_OP_EQUALS, "==", lexer->line, lexer->column - 2);
            }
            /* A single '=' is also treated as equality for now */
            return token_create(TOKEN_OP_EQUALS, "=", lexer->line, lexer->column - 1);
        case '!':
            lexer_advance(lexer);
            if (lexer_match(lexer, '=')) {
                return token_create(TOKEN_OP_NOT_EQUALS, "!=", lexer->line, lexer->column - 2);
            }
            /* A bare '!' is not an operator; fall through to the error below */
            break;
        case '<':
            lexer_advance(lexer);
            if (lexer_match(lexer, '=')) {
                return token_create(TOKEN_OP_LESS_EQUAL, "<=", lexer->line, lexer->column - 2);
            }
            return token_create(TOKEN_OP_LESS, "<", lexer->line, lexer->column - 1);
        case '>':
            lexer_advance(lexer);
            if (lexer_match(lexer, '=')) {
                return token_create(TOKEN_OP_GREATER_EQUAL, ">=", lexer->line, lexer->column - 2);
            }
            return token_create(TOKEN_OP_GREATER, ">", lexer->line, lexer->column - 1);
    }

    /* Unknown character */
    char error_msg[64];
    snprintf(error_msg, sizeof(error_msg), "Unexpected character: '%c'", c);
    lexer_set_error(lexer, error_msg);

    Token token;
    token.type = TOKEN_EOF;
    token.lexeme = NULL;
    token.line = lexer->line;
    token.column = lexer->column;
    return token;
}
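/*
 * Two-character operators above are matched greedily: "<=" is a single
 * LESS_EQUAL token, never LESS followed by EQUALS, and "->" becomes ARROW
 * rather than MINUS then GREATER, because each case tries lexer_match()
 * before falling back to the one-character token.
 */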
/* ============================================================================
 * Public Lexer API
 * ============================================================================ */

/**
 * @brief Tokenize source code
 *
 * @param source Source code to tokenize
 * @param source_len Length of source code
 * @param tokens Output array for tokens
 * @param max_tokens Maximum number of tokens to read
 * @return Number of tokens read, or -1 on error
 */
int baba_yaga_tokenize(const char* source, size_t source_len,
                       void** tokens, size_t max_tokens) {
    if (source == NULL || tokens == NULL) {
        return -1;
    }

    Lexer* lexer = lexer_create(source, source_len);
    if (lexer == NULL) {
        return -1;
    }

    size_t token_count = 0;

    while (token_count < max_tokens) {
        Token token = lexer_next_token(lexer);

        if (lexer->has_error) {
            /* Release any tokens produced before the error so the caller is
             * not left holding half an array on a -1 return */
            baba_yaga_free_tokens(tokens, token_count);
            lexer_destroy(lexer);
            return -1;
        }

        if (token.type == TOKEN_EOF) {
            break;
        }

        /* Allocate token and copy data */
        Token* token_ptr = malloc(sizeof(Token));
        if (token_ptr == NULL) {
            free(token.lexeme);
            baba_yaga_free_tokens(tokens, token_count);
            lexer_destroy(lexer);
            return -1;
        }

        *token_ptr = token;
        tokens[token_count] = token_ptr;
        token_count++;
    }

    lexer_destroy(lexer);
    return (int)token_count;
}

/**
 * @brief Free tokens
 *
 * @param tokens Array of tokens
 * @param count Number of tokens
 */
void baba_yaga_free_tokens(void** tokens, size_t count) {
    if (tokens == NULL) {
        return;
    }

    for (size_t i = 0; i < count; i++) {
        if (tokens[i] != NULL) {
            Token* token = (Token*)tokens[i];
            if (token->lexeme != NULL) {
                free(token->lexeme);
            }
            free(token);
        }
    }
}
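A minimal caller sketch, not part of the commit: it assumes baba_yaga.h
declares baba_yaga_tokenize and baba_yaga_free_tokens with the signatures
above. Since the Token struct is private to lexer.c, a caller can only count
and free tokens through this API; inspecting them would need an accessor in
the public header.

#include <stdio.h>
#include <string.h>
#include "baba_yaga.h"

int main(void) {
    const char* src = "when x is 0 then \"zero\"";
    void* tokens[64];

    /* Lex the snippet; n is the token count, or -1 on a lex error */
    int n = baba_yaga_tokenize(src, strlen(src), tokens, 64);
    if (n < 0) {
        fprintf(stderr, "lexing failed\n");
        return 1;
    }

    printf("lexed %d tokens\n", n); /* prints: lexed 6 tokens */
    baba_yaga_free_tokens(tokens, (size_t)n);
    return 0;
}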