/**
 * @file lexer.c
 * @brief Lexer implementation for Baba Yaga
 * @author eli_oat
 * @version 0.0.1
 * @date 2025
 *
 * This file implements the lexical analyzer for the Baba Yaga language.
 */

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "baba_yaga.h"

/* ============================================================================
 * Token Types
 * ============================================================================ */

typedef enum {
    /* End of file */
    TOKEN_EOF,

    /* Literals */
    TOKEN_NUMBER,
    TOKEN_STRING,
    TOKEN_BOOLEAN,

    /* Identifiers and keywords */
    TOKEN_IDENTIFIER,
    TOKEN_KEYWORD_WHEN,
    TOKEN_KEYWORD_IS,
    TOKEN_KEYWORD_THEN,
    TOKEN_KEYWORD_AND,
    TOKEN_KEYWORD_OR,
    TOKEN_KEYWORD_XOR,
    TOKEN_KEYWORD_NOT,
    TOKEN_KEYWORD_VIA,

    /* Operators */
    TOKEN_OP_PLUS,
    TOKEN_OP_MINUS,
    TOKEN_OP_UNARY_MINUS,
    TOKEN_OP_MULTIPLY,
    TOKEN_OP_DIVIDE,
    TOKEN_OP_MODULO,
    TOKEN_OP_POWER,
    TOKEN_OP_EQUALS,
    TOKEN_OP_NOT_EQUALS,
    TOKEN_OP_LESS,
    TOKEN_OP_LESS_EQUAL,
    TOKEN_OP_GREATER,
    TOKEN_OP_GREATER_EQUAL,

    /* Punctuation */
    TOKEN_LPAREN,
    TOKEN_RPAREN,
    TOKEN_LBRACE,
    TOKEN_RBRACE,
    TOKEN_LBRACKET,
    TOKEN_RBRACKET,
    TOKEN_COMMA,
    TOKEN_COLON,
    TOKEN_SEMICOLON,
    TOKEN_ARROW,
    TOKEN_DOT,

    /* Special tokens */
    TOKEN_FUNCTION_REF, /* @function */
    TOKEN_IO_IN,        /* ..in */
    TOKEN_IO_OUT,       /* ..out */
    TOKEN_IO_ASSERT,    /* ..assert */
    TOKEN_IO_EMIT,      /* ..emit */
    TOKEN_IO_LISTEN     /* ..listen */
} TokenType;

/* ============================================================================
 * Token Structure
 * ============================================================================ */

typedef struct {
    TokenType type;
    char* lexeme;
    int line;
    int column;
    union {
        double number;
        bool boolean;
    } literal;
} Token;

/* ============================================================================
 * Lexer Structure
 * ============================================================================ */

typedef struct {
    const char* source;
    size_t source_len;
    size_t position;
    int line;
    int column;
    Token current_token;
    bool has_error;
    char* error_message;
} Lexer;

/* ============================================================================
 * Token Helper Functions
 * ============================================================================ */

/**
 * @brief Create a simple token
 *
 * @param type Token type
 * @param lexeme Token lexeme
 * @param line Line number
 * @param column Column number
 * @return New token
 */
static Token token_create(TokenType type, const char* lexeme,
                          int line, int column) {
    Token token;
    token.type = type;
    token.lexeme = lexeme != NULL ? strdup(lexeme) : NULL;
    token.line = line;
    token.column = column;
    token.literal.number = 0.0; /* Initialize union */
    return token;
}
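/*
 * Note on ownership: token_create duplicates its lexeme with strdup, so
 * every token owns its lexeme buffer, which must eventually be released
 * with free(). A minimal sketch of the lifecycle:
 *
 *     Token t = token_create(TOKEN_LPAREN, "(", 1, 1);
 *     ...
 *     free(t.lexeme);
 *
 * The public API at the end of this file wraps this up in
 * baba_yaga_free_tokens().
 */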
/* ============================================================================
 * Lexer Functions
 * ============================================================================ */

/**
 * @brief Create a new lexer
 *
 * @param source Source code to tokenize
 * @param source_len Length of source code
 * @return New lexer instance, or NULL on failure
 */
static Lexer* lexer_create(const char* source, size_t source_len) {
    Lexer* lexer = malloc(sizeof(Lexer));
    if (lexer == NULL) {
        return NULL;
    }

    lexer->source = source;
    lexer->source_len = source_len;
    lexer->position = 0;
    lexer->line = 1;
    lexer->column = 1;
    lexer->has_error = false;
    lexer->error_message = NULL;

    /* Initialize current token */
    lexer->current_token.type = TOKEN_EOF;
    lexer->current_token.lexeme = NULL;
    lexer->current_token.line = 1;
    lexer->current_token.column = 1;

    return lexer;
}

/**
 * @brief Destroy a lexer
 *
 * @param lexer Lexer to destroy
 */
static void lexer_destroy(Lexer* lexer) {
    if (lexer == NULL) {
        return;
    }
    if (lexer->current_token.lexeme != NULL) {
        free(lexer->current_token.lexeme);
    }
    if (lexer->error_message != NULL) {
        free(lexer->error_message);
    }
    free(lexer);
}

/**
 * @brief Set lexer error
 *
 * @param lexer Lexer instance
 * @param message Error message
 */
static void lexer_set_error(Lexer* lexer, const char* message) {
    if (lexer == NULL) {
        return;
    }
    lexer->has_error = true;
    if (lexer->error_message != NULL) {
        free(lexer->error_message);
    }
    lexer->error_message = strdup(message);
}

/**
 * @brief Check if we're at the end of input
 *
 * @param lexer Lexer instance
 * @return true if at end, false otherwise
 */
static bool lexer_is_at_end(const Lexer* lexer) {
    return lexer->position >= lexer->source_len;
}

/**
 * @brief Peek at current character
 *
 * @param lexer Lexer instance
 * @return Current character, or '\0' if at end
 */
static char lexer_peek(const Lexer* lexer) {
    if (lexer_is_at_end(lexer)) {
        return '\0';
    }
    return lexer->source[lexer->position];
}

/**
 * @brief Peek at next character
 *
 * @param lexer Lexer instance
 * @return Next character, or '\0' if at end
 */
static char lexer_peek_next(const Lexer* lexer) {
    if (lexer->position + 1 >= lexer->source_len) {
        return '\0';
    }
    return lexer->source[lexer->position + 1];
}

/**
 * @brief Advance to next character
 *
 * @param lexer Lexer instance
 * @return Character that was advanced over
 */
static char lexer_advance(Lexer* lexer) {
    if (lexer_is_at_end(lexer)) {
        return '\0';
    }
    char c = lexer->source[lexer->position];
    lexer->position++;
    lexer->column++;
    if (c == '\n') {
        lexer->line++;
        lexer->column = 1;
    }
    return c;
}

/**
 * @brief Match current character and advance if it matches
 *
 * @param lexer Lexer instance
 * @param expected Expected character
 * @return true if matched, false otherwise
 */
static bool lexer_match(Lexer* lexer, char expected) {
    if (lexer_is_at_end(lexer)) {
        return false;
    }
    if (lexer->source[lexer->position] != expected) {
        return false;
    }
    lexer_advance(lexer);
    return true;
}

/**
 * @brief Skip whitespace
 *
 * @param lexer Lexer instance
 */
static void lexer_skip_whitespace(Lexer* lexer) {
    while (!lexer_is_at_end(lexer) &&
           isspace((unsigned char)lexer_peek(lexer))) {
        lexer_advance(lexer);
    }
}

/**
 * @brief Skip comments
 *
 * @param lexer Lexer instance
 */
static void lexer_skip_comments(Lexer* lexer) {
    if (lexer_peek(lexer) == '/' && lexer_peek_next(lexer) == '/') {
        /* Single-line comment: runs to end of line */
        while (!lexer_is_at_end(lexer) && lexer_peek(lexer) != '\n') {
            lexer_advance(lexer);
        }
    } else if (lexer_peek(lexer) == '/' && lexer_peek_next(lexer) == '*') {
        /* Multi-line comment */
        lexer_advance(lexer); /* consume '/' */
        lexer_advance(lexer); /* consume '*' */
        while (!lexer_is_at_end(lexer)) {
            if (lexer_peek(lexer) == '*' && lexer_peek_next(lexer) == '/') {
                lexer_advance(lexer); /* consume '*' */
                lexer_advance(lexer); /* consume '/' */
                break;
            }
            lexer_advance(lexer);
        }
        /* An unterminated block comment silently runs to end of input. */
    }
}
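/*
 * For reference, the cursor helpers above compose like this (a sketch,
 * assuming a lexer created over the 5-byte input "ab cd"):
 *
 *     lexer_peek(lexer);            // 'a', position unchanged
 *     lexer_advance(lexer);         // returns 'a', column becomes 2
 *     lexer_match(lexer, 'b');      // true, consumes 'b'
 *     lexer_skip_whitespace(lexer); // cursor now points at 'c'
 */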
/**
 * @brief Read a number literal
 *
 * @param lexer Lexer instance
 * @return Token with number literal
 */
static Token lexer_read_number(Lexer* lexer) {
    Token token;
    token.type = TOKEN_NUMBER;
    token.lexeme = NULL;
    token.line = lexer->line;
    token.column = lexer->column;

    size_t start = lexer->position;

    /* Read integer part */
    while (!lexer_is_at_end(lexer) &&
           isdigit((unsigned char)lexer_peek(lexer))) {
        lexer_advance(lexer);
    }

    /* Read decimal part */
    if (!lexer_is_at_end(lexer) && lexer_peek(lexer) == '.' &&
        isdigit((unsigned char)lexer_peek_next(lexer))) {
        lexer_advance(lexer); /* consume '.' */
        while (!lexer_is_at_end(lexer) &&
               isdigit((unsigned char)lexer_peek(lexer))) {
            lexer_advance(lexer);
        }
    }

    /* Read exponent part */
    if (!lexer_is_at_end(lexer) &&
        (lexer_peek(lexer) == 'e' || lexer_peek(lexer) == 'E')) {
        lexer_advance(lexer); /* consume 'e' or 'E' */
        if (!lexer_is_at_end(lexer) &&
            (lexer_peek(lexer) == '+' || lexer_peek(lexer) == '-')) {
            lexer_advance(lexer); /* consume sign */
        }
        /* An exponent must contain at least one digit. */
        if (lexer_is_at_end(lexer) ||
            !isdigit((unsigned char)lexer_peek(lexer))) {
            lexer_set_error(lexer, "Malformed exponent in number literal");
            token.type = TOKEN_EOF;
            return token;
        }
        while (!lexer_is_at_end(lexer) &&
               isdigit((unsigned char)lexer_peek(lexer))) {
            lexer_advance(lexer);
        }
    }

    /* Extract lexeme and convert to number */
    size_t length = lexer->position - start;
    token.lexeme = malloc(length + 1);
    if (token.lexeme == NULL) {
        lexer_set_error(lexer, "Memory allocation failed");
        token.type = TOKEN_EOF;
        return token;
    }
    memcpy(token.lexeme, lexer->source + start, length);
    token.lexeme[length] = '\0';
    token.literal.number = strtod(token.lexeme, NULL);
    return token;
}

/**
 * @brief Read a string literal
 *
 * @param lexer Lexer instance
 * @return Token with string literal
 */
static Token lexer_read_string(Lexer* lexer) {
    Token token;
    token.type = TOKEN_STRING;
    token.lexeme = NULL;
    token.line = lexer->line;
    token.column = lexer->column;

    lexer_advance(lexer); /* consume opening quote */

    size_t start = lexer->position;

    while (!lexer_is_at_end(lexer) && lexer_peek(lexer) != '"') {
        if (lexer_peek(lexer) == '\\') {
            lexer_advance(lexer); /* consume backslash */
            if (!lexer_is_at_end(lexer)) {
                lexer_advance(lexer); /* consume escaped character */
            }
        } else {
            lexer_advance(lexer);
        }
    }

    if (lexer_is_at_end(lexer)) {
        lexer_set_error(lexer, "Unterminated string literal");
        token.type = TOKEN_EOF;
        return token;
    }

    /* The lexeme is the raw source span between the quotes; escape
     * sequences are preserved verbatim for a later stage to process. */
    size_t length = lexer->position - start;
    lexer_advance(lexer); /* consume closing quote */

    token.lexeme = malloc(length + 1);
    if (token.lexeme == NULL) {
        lexer_set_error(lexer, "Memory allocation failed");
        token.type = TOKEN_EOF;
        return token;
    }
    memcpy(token.lexeme, lexer->source + start, length);
    token.lexeme[length] = '\0';
    return token;
}
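/*
 * Examples of literals the two readers above accept (a reference sketch,
 * not an exhaustive grammar):
 *
 *     42         integer             -> literal.number = 42.0
 *     3.14       decimal             -> literal.number = 3.14
 *     6.02e23    exponent            -> literal.number = 6.02e23
 *     1e-9       signed exponent     -> literal.number = 1e-9
 *     "a\"b"     string; the lexeme keeps the escape raw as a\"b
 */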
/**
 * @brief Read an identifier or keyword
 *
 * @param lexer Lexer instance
 * @return Token with identifier or keyword
 */
static Token lexer_read_identifier(Lexer* lexer) {
    Token token;
    token.lexeme = NULL;
    token.line = lexer->line;
    token.column = lexer->column;

    size_t start = lexer->position;
    while (!lexer_is_at_end(lexer) &&
           (isalnum((unsigned char)lexer_peek(lexer)) ||
            lexer_peek(lexer) == '_')) {
        lexer_advance(lexer);
    }
    size_t length = lexer->position - start;

    /* Extract lexeme */
    token.lexeme = malloc(length + 1);
    if (token.lexeme == NULL) {
        lexer_set_error(lexer, "Memory allocation failed");
        token.type = TOKEN_EOF;
        return token;
    }
    memcpy(token.lexeme, lexer->source + start, length);
    token.lexeme[length] = '\0';

    /* Check if it's a keyword */
    if (strcmp(token.lexeme, "when") == 0) {
        token.type = TOKEN_KEYWORD_WHEN;
    } else if (strcmp(token.lexeme, "is") == 0) {
        token.type = TOKEN_KEYWORD_IS;
    } else if (strcmp(token.lexeme, "then") == 0) {
        token.type = TOKEN_KEYWORD_THEN;
    } else if (strcmp(token.lexeme, "and") == 0) {
        token.type = TOKEN_KEYWORD_AND;
    } else if (strcmp(token.lexeme, "or") == 0) {
        token.type = TOKEN_KEYWORD_OR;
    } else if (strcmp(token.lexeme, "xor") == 0) {
        token.type = TOKEN_KEYWORD_XOR;
    } else if (strcmp(token.lexeme, "not") == 0) {
        token.type = TOKEN_KEYWORD_NOT;
    } else if (strcmp(token.lexeme, "via") == 0) {
        token.type = TOKEN_KEYWORD_VIA;
    } else if (strcmp(token.lexeme, "true") == 0) {
        token.type = TOKEN_BOOLEAN;
        token.literal.boolean = true;
    } else if (strcmp(token.lexeme, "false") == 0) {
        token.type = TOKEN_BOOLEAN;
        token.literal.boolean = false;
    } else {
        token.type = TOKEN_IDENTIFIER;
    }

    return token;
}
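/*
 * A sketch of how a small clause lexes, given the keyword table above:
 *
 *     when x is 1 then 2
 *
 *     TOKEN_KEYWORD_WHEN  "when"
 *     TOKEN_IDENTIFIER    "x"
 *     TOKEN_KEYWORD_IS    "is"
 *     TOKEN_NUMBER        "1"
 *     TOKEN_KEYWORD_THEN  "then"
 *     TOKEN_NUMBER        "2"
 */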
/**
 * @brief Read a special token (function reference, IO operations)
 *
 * @param lexer Lexer instance
 * @return Token with special type
 */
static Token lexer_read_special(Lexer* lexer) {
    Token token;
    token.type = TOKEN_EOF;
    token.lexeme = NULL;
    token.line = lexer->line;
    token.column = lexer->column;

    if (lexer_peek(lexer) == '@') {
        /* Function reference */
        lexer_advance(lexer); /* consume '@' */

        if (!lexer_is_at_end(lexer) && lexer_peek(lexer) == '(') {
            /* @(expression) syntax: return just the '@' token */
            token.type = TOKEN_FUNCTION_REF;
            token.lexeme = malloc(2); /* '@' plus '\0' */
            if (token.lexeme == NULL) {
                lexer_set_error(lexer, "Memory allocation failed");
                token.type = TOKEN_EOF;
                return token;
            }
            token.lexeme[0] = '@';
            token.lexeme[1] = '\0';
        } else {
            /* @function_name syntax */
            size_t start = lexer->position;
            while (!lexer_is_at_end(lexer) &&
                   (isalnum((unsigned char)lexer_peek(lexer)) ||
                    lexer_peek(lexer) == '_')) {
                lexer_advance(lexer);
            }
            size_t length = lexer->position - start;

            if (length == 0) {
                lexer_set_error(lexer, "Invalid function reference");
                token.type = TOKEN_EOF;
                return token;
            }

            token.type = TOKEN_FUNCTION_REF;
            token.lexeme = malloc(length + 2); /* +2 for '@' and '\0' */
            if (token.lexeme == NULL) {
                lexer_set_error(lexer, "Memory allocation failed");
                token.type = TOKEN_EOF;
                return token;
            }
            token.lexeme[0] = '@';
            memcpy(token.lexeme + 1, lexer->source + start, length);
            token.lexeme[length + 1] = '\0';
        }
    } else if (lexer_peek(lexer) == '.' && lexer_peek_next(lexer) == '.') {
        /* IO operation */
        lexer_advance(lexer); /* consume first '.' */
        lexer_advance(lexer); /* consume second '.' */

        size_t start = lexer->position;
        while (!lexer_is_at_end(lexer) &&
               (isalpha((unsigned char)lexer_peek(lexer)) ||
                lexer_peek(lexer) == '_')) {
            lexer_advance(lexer);
        }
        size_t length = lexer->position - start;

        if (length == 0) {
            lexer_set_error(lexer, "Invalid IO operation");
            token.type = TOKEN_EOF;
            return token;
        }

        token.lexeme = malloc(length + 3); /* +3 for ".." and '\0' */
        if (token.lexeme == NULL) {
            lexer_set_error(lexer, "Memory allocation failed");
            token.type = TOKEN_EOF;
            return token;
        }
        token.lexeme[0] = '.';
        token.lexeme[1] = '.';
        memcpy(token.lexeme + 2, lexer->source + start, length);
        token.lexeme[length + 2] = '\0';

        /* Determine IO operation type */
        if (strcmp(token.lexeme, "..in") == 0) {
            token.type = TOKEN_IO_IN;
        } else if (strcmp(token.lexeme, "..out") == 0) {
            token.type = TOKEN_IO_OUT;
        } else if (strcmp(token.lexeme, "..assert") == 0) {
            token.type = TOKEN_IO_ASSERT;
        } else if (strcmp(token.lexeme, "..emit") == 0) {
            token.type = TOKEN_IO_EMIT;
        } else if (strcmp(token.lexeme, "..listen") == 0) {
            token.type = TOKEN_IO_LISTEN;
        } else {
            lexer_set_error(lexer, "Unknown IO operation");
            token.type = TOKEN_EOF;
            free(token.lexeme);
            token.lexeme = NULL;
            return token;
        }
    }

    return token;
}
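/*
 * Examples of the special forms recognized above:
 *
 *     @add     -> TOKEN_FUNCTION_REF, lexeme "@add"
 *     @(f)     -> TOKEN_FUNCTION_REF, lexeme "@"; the "(f)" part is
 *                 lexed separately as TOKEN_LPAREN, TOKEN_IDENTIFIER, ...
 *     ..out    -> TOKEN_IO_OUT, lexeme "..out"
 *     ..flush  -> error "Unknown IO operation" ("flush" is a made-up
 *                 name here, used only to illustrate the error branch)
 */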
/**
 * @brief Read the next token
 *
 * @param lexer Lexer instance
 * @return Next token
 */
static Token lexer_next_token(Lexer* lexer) {
    /* Skip whitespace and comments until no further progress is made,
     * so runs of adjacent comments are consumed as well. */
    for (;;) {
        size_t before = lexer->position;
        lexer_skip_whitespace(lexer);
        lexer_skip_comments(lexer);
        if (lexer->position == before) {
            break;
        }
    }

    if (lexer_is_at_end(lexer)) {
        Token token;
        token.type = TOKEN_EOF;
        token.lexeme = NULL;
        token.line = lexer->line;
        token.column = lexer->column;
        return token;
    }

    char c = lexer_peek(lexer);

    /* Numbers */
    if (isdigit((unsigned char)c)) {
        return lexer_read_number(lexer);
    }

    /* Strings */
    if (c == '"') {
        return lexer_read_string(lexer);
    }

    /* Special tokens */
    if (c == '@' || (c == '.' && lexer_peek_next(lexer) == '.')) {
        return lexer_read_special(lexer);
    }

    /* Identifiers and keywords */
    if (isalpha((unsigned char)c) || c == '_') {
        return lexer_read_identifier(lexer);
    }

    /* Operators and punctuation: record the token's start position
     * before consuming any characters. */
    int line = lexer->line;
    int column = lexer->column;

    switch (c) {
    case '(':
        lexer_advance(lexer);
        return token_create(TOKEN_LPAREN, "(", line, column);
    case ')':
        lexer_advance(lexer);
        return token_create(TOKEN_RPAREN, ")", line, column);
    case '{':
        lexer_advance(lexer);
        return token_create(TOKEN_LBRACE, "{", line, column);
    case '}':
        lexer_advance(lexer);
        return token_create(TOKEN_RBRACE, "}", line, column);
    case '[':
        lexer_advance(lexer);
        return token_create(TOKEN_LBRACKET, "[", line, column);
    case ']':
        lexer_advance(lexer);
        return token_create(TOKEN_RBRACKET, "]", line, column);
    case ',':
        lexer_advance(lexer);
        return token_create(TOKEN_COMMA, ",", line, column);
    case ':':
        lexer_advance(lexer);
        return token_create(TOKEN_COLON, ":", line, column);
    case ';':
        lexer_advance(lexer);
        return token_create(TOKEN_SEMICOLON, ";", line, column);
    case '.':
        lexer_advance(lexer);
        return token_create(TOKEN_DOT, ".", line, column);
    case '-':
        lexer_advance(lexer);
        if (lexer_match(lexer, '>')) {
            return token_create(TOKEN_ARROW, "->", line, column);
        }
        /* A '-' directly followed by a digit, identifier character, or
         * '(' is treated as a unary minus; otherwise it is binary. */
        if (isdigit((unsigned char)lexer_peek(lexer)) ||
            isalpha((unsigned char)lexer_peek(lexer)) ||
            lexer_peek(lexer) == '_' ||
            lexer_peek(lexer) == '(') {
            return token_create(TOKEN_OP_UNARY_MINUS, "-", line, column);
        }
        return token_create(TOKEN_OP_MINUS, "-", line, column);
    case '+':
        lexer_advance(lexer);
        return token_create(TOKEN_OP_PLUS, "+", line, column);
    case '*':
        lexer_advance(lexer);
        return token_create(TOKEN_OP_MULTIPLY, "*", line, column);
    case '/':
        lexer_advance(lexer);
        return token_create(TOKEN_OP_DIVIDE, "/", line, column);
    case '%':
        lexer_advance(lexer);
        return token_create(TOKEN_OP_MODULO, "%", line, column);
    case '^':
        lexer_advance(lexer);
        return token_create(TOKEN_OP_POWER, "^", line, column);
    case '=':
        lexer_advance(lexer);
        /* Both '=' and '==' map to TOKEN_OP_EQUALS. */
        if (lexer_match(lexer, '=')) {
            return token_create(TOKEN_OP_EQUALS, "==", line, column);
        }
        return token_create(TOKEN_OP_EQUALS, "=", line, column);
    case '!':
        lexer_advance(lexer);
        if (lexer_match(lexer, '=')) {
            return token_create(TOKEN_OP_NOT_EQUALS, "!=", line, column);
        }
        break; /* a lone '!' falls through to the error below */
    case '<':
        lexer_advance(lexer);
        if (lexer_match(lexer, '=')) {
            return token_create(TOKEN_OP_LESS_EQUAL, "<=", line, column);
        }
        return token_create(TOKEN_OP_LESS, "<", line, column);
    case '>':
        lexer_advance(lexer);
        if (lexer_match(lexer, '=')) {
            return token_create(TOKEN_OP_GREATER_EQUAL, ">=", line, column);
        }
        return token_create(TOKEN_OP_GREATER, ">", line, column);
    }

    /* Unknown character */
    char error_msg[64];
    snprintf(error_msg, sizeof(error_msg), "Unexpected character: '%c'", c);
    lexer_set_error(lexer, error_msg);

    Token token;
    token.type = TOKEN_EOF;
    token.lexeme = NULL;
    token.line = lexer->line;
    token.column = lexer->column;
    return token;
}
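/*
 * The '-' disambiguation above is purely lexical: '-' becomes a unary
 * minus only when the very next character can start an operand. For
 * example:
 *
 *     x - 1    -> ... TOKEN_OP_MINUS ...        (next char is ' ')
 *     x -1     -> ... TOKEN_OP_UNARY_MINUS ...  (next char is '1')
 *     -(x)     -> TOKEN_OP_UNARY_MINUS, TOKEN_LPAREN, ...
 */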
/* ============================================================================
 * Public Lexer API
 * ============================================================================ */

/**
 * @brief Tokenize source code
 *
 * Tokenization stops at end of input or after max_tokens tokens,
 * whichever comes first.
 *
 * @param source Source code to tokenize
 * @param source_len Length of source code
 * @param tokens Output array for tokens
 * @param max_tokens Maximum number of tokens to read
 * @return Number of tokens read, or -1 on error
 */
int baba_yaga_tokenize(const char* source, size_t source_len,
                       void** tokens, size_t max_tokens) {
    if (source == NULL || tokens == NULL) {
        return -1;
    }

    Lexer* lexer = lexer_create(source, source_len);
    if (lexer == NULL) {
        return -1;
    }

    size_t token_count = 0;
    while (token_count < max_tokens) {
        Token token = lexer_next_token(lexer);

        if (lexer->has_error) {
            baba_yaga_free_tokens(tokens, token_count); /* release partial output */
            lexer_destroy(lexer);
            return -1;
        }

        if (token.type == TOKEN_EOF) {
            break;
        }

        /* Allocate token and copy data */
        Token* token_ptr = malloc(sizeof(Token));
        if (token_ptr == NULL) {
            free(token.lexeme);
            baba_yaga_free_tokens(tokens, token_count);
            lexer_destroy(lexer);
            return -1;
        }
        *token_ptr = token;
        tokens[token_count] = token_ptr;
        token_count++;
    }

    lexer_destroy(lexer);
    return (int)token_count;
}

/**
 * @brief Free tokens
 *
 * @param tokens Array of tokens
 * @param count Number of tokens
 */
void baba_yaga_free_tokens(void** tokens, size_t count) {
    if (tokens == NULL) {
        return;
    }
    for (size_t i = 0; i < count; i++) {
        if (tokens[i] != NULL) {
            Token* token = (Token*)tokens[i];
            if (token->lexeme != NULL) {
                free(token->lexeme);
            }
            free(token);
        }
    }
}
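/*
 * A minimal self-test sketch of the public API above. It assumes that
 * baba_yaga.h declares baba_yaga_tokenize and baba_yaga_free_tokens as
 * defined in this file; the BABA_YAGA_LEXER_DEMO guard and the demo
 * itself are illustrative, not part of the build.
 */
#ifdef BABA_YAGA_LEXER_DEMO
int main(void) {
    const char* source = "when x is 1 then ..out \"done\"";
    void* tokens[64];

    int count = baba_yaga_tokenize(source, strlen(source), tokens, 64);
    if (count < 0) {
        fprintf(stderr, "tokenize failed\n");
        return 1;
    }

    /* Print each token's position, type, and lexeme. */
    for (int i = 0; i < count; i++) {
        const Token* t = (const Token*)tokens[i];
        printf("%2d:%-2d type=%d lexeme=%s\n",
               t->line, t->column, (int)t->type,
               t->lexeme != NULL ? t->lexeme : "(null)");
    }

    baba_yaga_free_tokens(tokens, (size_t)count);
    return 0;
}
#endif /* BABA_YAGA_LEXER_DEMO */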