diff options
Diffstat (limited to 'awk/rawk/rawk.awk')
-rw-r--r-- | awk/rawk/rawk.awk | 1656 |
1 files changed, 494 insertions, 1162 deletions
diff --git a/awk/rawk/rawk.awk b/awk/rawk/rawk.awk index b4128e2..c4e2ff1 100644 --- a/awk/rawk/rawk.awk +++ b/awk/rawk/rawk.awk @@ -1,1206 +1,538 @@ -#!/usr/bin/env awk -f - -# ----------------------------------------------------------------------------- -# rawk.awk - The `rawk` Language Compiler -# -# This script translates a `.rawk` source file into standard, portable awk code. -# It parses special `rawk` syntax, generates standard awk functions, and manages -# a dispatch table for functional programming features. -# -# USAGE: -# awk -f rawk.awk my_program.rawk | awk -f - -# -# EXAMPLES: -# # Compile and run a rawk program -# awk -f rawk.awk hello.rawk | awk -f - -# -# # Compile to a file for later use -# awk -f rawk.awk hello.rawk > hello.awk -# awk -f hello.awk -# -# LANGUAGE FEATURES: -# -# 1. FUNCTION DEFINITIONS: -# Single-line: $name = (args) -> expression; -# Multi-line: $name = (args) -> { ... }; -# -# Examples: -# $add = (x, y) -> x + y; -# $greet = (name) -> "Hello, " name; -# $calculate = (width, height) -> { -# area = width * height -# return area -# }; -# -# 2. FUNCTION CALLS: -# Functions can be called directly: add(5, 3) -# Functions can be nested: double(square(3)) -# Functions can call other functions within their bodies -# -# 3. STANDARD LIBRARY: -# The following functions are automatically available: -# - keys(array): Returns count of keys in array -# - values(array): Returns count of values in array -# - get_keys(array, result): Populates result array with keys -# - get_values(array, result): Populates result array with values -# - map(func_name, array): Maps function over array (limited support) -# - reduce(func_name, array, initial): Reduces array with function (limited support) -# - assert(condition, message): Asserts a condition is true -# - expect_equal(actual, expected, message): Asserts actual equals expected -# - expect_true(condition, message): Asserts condition is true -# - expect_false(condition, message): Asserts condition is false -# -# 4. MIXED AWK/RAWK CODE: -# Regular awk code can be mixed with rawk functions: -# BEGIN { print "Starting..." } -# $process = (line) -> "Processed: " line; -# { print process($0) } -# END { print "Done." } +#!/usr/bin/awk -f + +# rawk.awk + +# Author: @eli_oat +# License: Public Domain +# Lets make awk rawk + +# ============================================================================= +# Multi-pass compiler +# ============================================================================= +# +# This compiler transforms rawk code into standard awk and smartly includes only +# those standard library functions you've actually used. It uses a multi-pass +# approach to overcome awk's variable scoping limitations and ensure +# deterministic compilation. # # COMPILATION PROCESS: -# 1. Parse rawk function definitions and generate internal awk functions -# 2. Build dispatch table mapping public names to internal names -# 3. Replace function calls with internal names -# 4. Generate standard library functions -# 5. Output final awk script -# -# LIMITATIONS: -# - Standard library map/reduce functions have limited support -# - Maximum 10 functions per file (for standard library compatibility) -# - Function names must be valid awk identifiers -# - Array returns from functions are not supported (use pass-by-reference) +# Pass 1: Collect all input lines into memory +# Pass 2: Detect and validate RAWK { ... } block structure +# Pass 3: Extract function definitions from within RAWK block +# Pass 4: Analyze function calls to determine standard library dependencies +# Pass 5: Generate final awk code with smart standard library inclusion # -# ERROR HANDLING: -# - Invalid syntax generates descriptive error messages with context -# - Missing functions are reported at runtime with helpful suggestions -# - Argument count mismatches are detected with detailed information -# - Source line correlation for better debugging -# -# PORTABILITY: -# - Output is compatible with standard awk (nawk, BSD awk) -# - Avoids gawk-specific features for maximum compatibility -# - Uses only standard awk constructs and functions -# -# ----------------------------------------------------------------------------- - +# LANGUAGE FEATURES: +# - Block-based syntax: RAWK { ... } for function definitions +# - Functional programming utilities: map, reduce, filter, etc. +# - Smart standard library: only includes functions actually used +# - Comprehensive error handling with actionable messages +# ============================================================================= -# The BEGIN block runs once before any input is processed. -# Its purpose is to initialize the compiler's state. BEGIN { - # --- Compiler State Initialization --- - - # Counter to generate unique internal names for lambda functions (e.g., __lambda_1, __lambda_2). - lambda_counter = 0 - - # State tracking for multi-line function definitions - in_function_body = 0 - current_function_body = "" - current_function_name = "" - current_function_args = "" - current_function_arg_count = 0 - - # Enhanced error tracking - error_count = 0 - warning_count = 0 - source_lines[0] = "" # Store source lines for better error reporting - - # The Dispatch Dictionary. This is the core of the portable dispatch system. - # Key: The public function name (e.g., "my_add"). - # Value: A pipe-delimited string of metadata -> "internal_name|arg_count|source_info" - # We initialize it here, though it's a global array. - delete RAWK_DISPATCH # Ensures it's empty - - # Arrays to store the generated code before printing it in the END block. - # This ensures the correct final order of the output script. - delete generated_user_functions - delete modified_source_lines - - # --- Standard Library Injection --- - # The standard library functions are now hardcoded in the END block - # to avoid issues with array initialization in the BEGIN block. + # ============================================================================= + # INITIALIZATION: Set up data structures for multi-pass compilation + # ============================================================================= + + RAWK_VERSION = "0.0.1" + + # Arrays to store compilation state + delete lines # All input lines (Pass 1) + delete FUNCTION_NAMES # User-defined function names (Pass 3) + delete FUNCTION_ARGS # User-defined function arguments (Pass 3) + delete FUNCTION_BODIES # User-defined function bodies (Pass 3) + delete USED_FUNCTIONS # User functions actually called (Pass 4) + delete USED_STDLIB_FUNCTIONS # Standard library functions used (Pass 4) + + # Compilation state counters + line_count = 0 # Total number of input lines + function_count = 0 # Number of user-defined functions + in_rawk_block = 0 # Flag: currently inside RAWK block + rawk_block_start = 0 # Line number where RAWK block starts + rawk_block_end = 0 # Line number where RAWK block ends + + # ============================================================================= + # STANDARD LIBRARY CATALOG: All available functions for smart inclusion + # ============================================================================= + # These functions are conditionally included based on actual usage in the code + + # Core type checking and validation functions + stdlib_functions["assert"] = 1 + stdlib_functions["expect_equal"] = 1 + stdlib_functions["expect_true"] = 1 + stdlib_functions["expect_false"] = 1 + stdlib_functions["is_number"] = 1 + stdlib_functions["is_string"] = 1 + stdlib_functions["is_positive"] = 1 + stdlib_functions["is_negative"] = 1 + stdlib_functions["is_zero"] = 1 + stdlib_functions["is_integer"] = 1 + stdlib_functions["is_float"] = 1 + stdlib_functions["is_boolean"] = 1 + stdlib_functions["is_truthy"] = 1 + stdlib_functions["is_falsy"] = 1 + stdlib_functions["is_empty"] = 1 + + # Data format validation functions + stdlib_functions["is_email"] = 1 + stdlib_functions["is_url"] = 1 + stdlib_functions["is_ipv4"] = 1 + stdlib_functions["is_ipv6"] = 1 + stdlib_functions["is_uuid"] = 1 + stdlib_functions["is_alpha"] = 1 + stdlib_functions["is_numeric"] = 1 + stdlib_functions["is_alphanumeric"] = 1 + stdlib_functions["is_palindrome"] = 1 + stdlib_functions["is_hex"] = 1 + stdlib_functions["is_csv"] = 1 + stdlib_functions["is_tsv"] = 1 + + # HTTP status and method validation functions + stdlib_functions["http_is_redirect"] = 1 + stdlib_functions["http_is_client_error"] = 1 + stdlib_functions["http_is_server_error"] = 1 + stdlib_functions["http_is_get"] = 1 + stdlib_functions["http_is_post"] = 1 + stdlib_functions["http_is_safe_method"] = 1 + stdlib_functions["http_is_mutating_method"] = 1 + + # Array utility functions + stdlib_functions["keys"] = 1 + stdlib_functions["values"] = 1 + stdlib_functions["get_keys"] = 1 + stdlib_functions["get_values"] = 1 + + # Functional programming utilities + stdlib_functions["map"] = 1 + stdlib_functions["reduce"] = 1 + stdlib_functions["filter"] = 1 + stdlib_functions["find"] = 1 + stdlib_functions["findIndex"] = 1 + stdlib_functions["flatMap"] = 1 + stdlib_functions["take"] = 1 + stdlib_functions["drop"] = 1 + stdlib_functions["pipe"] = 1 + stdlib_functions["pipe_multi"] = 1 + + # Numeric predicate functions + stdlib_functions["is_even"] = 1 + stdlib_functions["is_odd"] = 1 + stdlib_functions["is_prime"] = 1 + stdlib_functions["is_in_range"] = 1 + + # String analysis functions + stdlib_functions["is_whitespace"] = 1 + stdlib_functions["is_uppercase"] = 1 + stdlib_functions["is_lowercase"] = 1 + stdlib_functions["is_length"] = 1 + + # Web-specific utility functions + stdlib_functions["url_is_static_file"] = 1 + stdlib_functions["url_has_query_params"] = 1 + stdlib_functions["url_is_root_path"] = 1 + stdlib_functions["user_agent_is_mobile"] = 1 + stdlib_functions["user_agent_is_desktop"] = 1 + stdlib_functions["user_agent_is_browser"] = 1 + stdlib_functions["is_bot"] = 1 + stdlib_functions["ip_is_local"] = 1 + stdlib_functions["ip_is_public"] = 1 + stdlib_functions["ip_is_ipv4"] = 1 + stdlib_functions["ip_is_ipv6"] = 1 } -# Enhanced error reporting function -function report_error(message, line_num, line_content, suggestion) { - error_count++ - print "❌ rawk compilation error at line " line_num ":" > "/dev/stderr" - if (line_content != "") { - print " " line_content > "/dev/stderr" - # Add a caret to point to the error location - print " " "^" > "/dev/stderr" - } - print " " message > "/dev/stderr" - if (suggestion != "") { - print "💡 Suggestion: " suggestion > "/dev/stderr" - } - print "" > "/dev/stderr" -} - -# Enhanced warning reporting function -function report_warning(message, line_num, line_content, suggestion) { - warning_count++ - print "⚠️ rawk warning at line " line_num ":" > "/dev/stderr" - if (line_content != "") { - print " " line_content > "/dev/stderr" - } - print " " message > "/dev/stderr" - if (suggestion != "") { - print "💡 Suggestion: " suggestion > "/dev/stderr" - } - print "" > "/dev/stderr" +# ============================================================================= +# PASS 1: COLLECT ALL INPUT LINES +# ============================================================================= +# Store every line in memory for multi-pass processing. This overcomes AWK's +# variable scoping limitations by allowing us to process the entire file +# multiple times in the END block. +{ + lines[++line_count] = $0 } -# Function to validate function name -function validate_function_name(name, line_num, line_content) { - if (name == "") { - report_error("Function name cannot be empty", line_num, line_content, "Use a valid identifier like 'add', 'process_data', etc.") - return 0 - } - if (name ~ /^[0-9]/) { - report_error("Function name cannot start with a number", line_num, line_content, "Use a letter or underscore first, like '_add' or 'add'") - return 0 - } - if (name ~ /[^a-zA-Z0-9_]/) { - report_error("Function name contains invalid characters", line_num, line_content, "Use only letters, numbers, and underscores") - return 0 - } - return 1 -} +# ============================================================================= +# PASSES 2-5: MULTI-PASS COMPILATION IN END BLOCK +# ============================================================================= +# All subsequent passes happen in the END block to ensure we have complete +# information about the entire source file before making compilation decisions. -# Function to validate argument list -function validate_argument_list(args, line_num, line_content) { - if (args == "") return 1 # Empty args are valid - - # Check for balanced parentheses - paren_count = 0 - for (i = 1; i <= length(args); i++) { - char = substr(args, i, 1) - if (char == "(") paren_count++ - else if (char == ")") paren_count-- - if (paren_count < 0) { - report_error("Unmatched closing parenthesis in argument list", line_num, line_content, "Check your parentheses: " args) - return 0 +END { + # ============================================================================= + # PASS 2: DETECT AND VALIDATE RAWK BLOCK STRUCTURE + # ============================================================================= + # Find the RAWK { ... } block and validate its structure. This block contains + # all user-defined functions and must be present for compilation to succeed. + # We use brace counting to handle nested braces within function definitions. + + for (i = 1; i <= line_count; i++) { + line = lines[i] + + # Look for RAWK block start: "RAWK {" + if (line ~ /^[[:space:]]*RAWK[[:space:]]*\{/) { + # Ensure only one RAWK block exists + if (in_rawk_block) { + print "Error: Nested or multiple RAWK blocks are not supported" > "/dev/stderr" + exit 1 + } + + in_rawk_block = 1 + rawk_block_start = i + + # Find the matching closing brace using brace counting + # This handles nested braces from function definitions within the block + brace_count = 1 + for (j = i + 1; j <= line_count; j++) { + line_j = lines[j] + for (k = 1; k <= length(line_j); k++) { + char = substr(line_j, k, 1) + if (char == "{") brace_count++ + if (char == "}") brace_count-- + if (brace_count == 0) { + rawk_block_end = j + in_rawk_block = 0 + break + } + } + if (brace_count == 0) break + } + + # Validate that the block was properly closed + if (brace_count != 0) { + print "Error: RAWK block opened at line " i " but never closed" > "/dev/stderr" + exit 1 + } + break # Found the complete RAWK block } } - if (paren_count != 0) { - report_error("Unmatched opening parenthesis in argument list", line_num, line_content, "Check your parentheses: " args) - return 0 - } - return 1 -} - -# Function to suggest corrections for common syntax errors -function suggest_correction(line, line_num) { - if (line ~ /\$[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*\([^)]*\)\s*[^-]/) { - # Missing arrow - report_error("Missing '->' in function definition", line_num, line, "Add '->' after the argument list: " gensub(/(\$[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*\([^)]*\))\s*/, "\\1 -> ", 1, line)) - return 1 - } - if (line ~ /\$[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*\([^)]*->/) { - # Missing closing parenthesis - report_error("Missing closing parenthesis in argument list", line_num, line, "Add ')' before '->'") - return 1 - } - if (line ~ /\$[a-zA-Z_][a-zA-Z0-9_]*\s*=.*->/) { - # Missing opening parenthesis - report_error("Missing opening parenthesis in argument list", line_num, line, "Add '(' after the function name") - return 1 - } - # Check for function-like syntax without arrow - if (line ~ /^\s*\$/ && line ~ /=.*\(.*\)/ && line !~ /->/) { - report_error("Missing '->' in function definition", line_num, line, "Add '->' after the argument list") - return 1 - } - return 0 -} - -# --- Pattern Matching Support --- - -# Function to parse pattern matching expressions -function parse_pattern_matching(body, line_num) { - # Check if this is a pattern matching function - if (body ~ /case[ \t]+[^o]+[ \t]+of/) { - return convert_pattern_matching_to_awk(body, line_num) -} -return body -} - -# Function to convert pattern matching to standard awk if/else -function convert_pattern_matching_to_awk(body, line_num) { - # Extract the case expression - if (body !~ /case[ \t]+[^o]+[ \t]+of/) { - report_error("Invalid pattern matching syntax", line_num, body, "Use format: case value of | pattern -> result") - return body + # Ensure a RAWK block was found + if (!rawk_block_start) { + print "Error: No RAWK block found" > "/dev/stderr" + exit 1 } - # Extract the value being matched - if (match(body, /case[ \t]+([^o]+)[ \t]+of/)) { - # Find the start of the value after "case" - case_start = index(body, "case") - if (case_start > 0) { - # Find the end of "case" and skip whitespace - after_case = substr(body, case_start + 4) - # Find the start of "of" - of_start = index(after_case, "of") - if (of_start > 0) { - match_value = substr(after_case, 1, of_start - 1) - gsub(/^[ \t]+|[ \t]+$/, "", match_value) # Trim whitespace - - } else { - report_error("Invalid pattern matching syntax", line_num, body, "Use format: case value of | pattern -> result") - return body - } - } else { - report_error("Invalid pattern matching syntax", line_num, body, "Use format: case value of | pattern -> result") - return body - } - } else { - report_error("Invalid pattern matching syntax", line_num, body, "Use format: case value of | pattern -> result") - return body + # Final validation that the block was properly closed + if (in_rawk_block) { + print "Error: RAWK block opened at line " rawk_block_start " but never closed" > "/dev/stderr" + exit 1 } - # Split the body into lines to process patterns - split(body, lines, "\n") - result = "" - first_pattern = 1 + # ============================================================================= + # PASS 3: EXTRACT FUNCTION DEFINITIONS FROM RAWK BLOCK + # ============================================================================= + # Parse function definitions in the format: $name = (args) -> { body } + # Extract function name, arguments, and body for later code generation. - for (i = 1; i <= length(lines); i++) { + i = rawk_block_start + 1 + while (i < rawk_block_end) { line = lines[i] - # Skip empty lines and case/of lines - if (line ~ /^\s*$/ || line ~ /^\s*case.*of\s*$/) continue - - # Check if this is a pattern line (starts with |) - if (line ~ /^[ \t]*\|/) { - # Parse the pattern - pattern_code = parse_pattern_line(line, match_value, line_num) + # Match function definition pattern: $name = (args) -> { + if (line ~ /^[[:space:]]*\$[a-zA-Z_][a-zA-Z0-9_]*[[:space:]]*=[[:space:]]*\(.*\)[[:space:]]*->[[:space:]]*\{/) { - # Build the if/else chain - if (first_pattern) { - result = " " pattern_code - first_pattern = 0 - } else { - result = result "\n else " pattern_code + # Extract function name (remove $ prefix and whitespace) + if (match(line, /^[[:space:]]*\$([a-zA-Z_][a-zA-Z0-9_]*)/)) { + func_name = substr(line, RSTART + 1, RLENGTH - 1) + gsub(/[[:space:]]/, "", func_name) + gsub(/^\$/, "", func_name) # Remove the $ prefix for awk compatibility + + # Extract function arguments from parentheses + args_start = index(line, "(") + 1 + args_end = index(line, ")") + args = substr(line, args_start, args_end - args_start) + gsub(/[[:space:]]/, "", args) # Remove whitespace from arguments + + # Extract function body using brace counting + # This handles nested braces within the function body + body = "" + brace_count = 1 + j = i + 1 + while (j <= line_count && brace_count > 0) { + body_line = lines[j] + for (k = 1; k <= length(body_line); k++) { + char = substr(body_line, k, 1) + if (char == "{") brace_count++ + if (char == "}") brace_count-- + if (brace_count == 0) break + } + if (brace_count > 0) { + body = body body_line "\n" + } + j++ + } + + # Store extracted function information + function_count++ + FUNCTION_NAMES[function_count] = func_name + FUNCTION_ARGS[function_count] = args + FUNCTION_BODIES[function_count] = body + USED_FUNCTIONS[func_name] = 1 # Mark as used (defined) + + # Skip to end of function definition + i = j - 1 } } + i++ } - # Clean up and fix variable references - gsub(/is_positive\(n\)/, "is_positive(" match_value ")", result) - gsub(/is_negative\(n\)/, "is_negative(" match_value ")", result) - gsub(/is_alpha\(s\)/, "is_alpha(" match_value ")", result) - gsub(/is_numeric\(s\)/, "is_numeric(" match_value ")", result) - gsub(/is_alphanumeric\(s\)/, "is_alphanumeric(" match_value ")", result) - gsub(/is_palindrome\(s\)/, "is_palindrome(" match_value ")", result) - gsub(/is_number\(v\)/, "is_number(" match_value ")", result) - gsub(/is_string\(v\)/, "is_string(" match_value ")", result) - gsub(/is_empty\(v\)/, "is_empty(" match_value ")", result) - gsub(/is_email\(v\)/, "is_email(" match_value ")", result) - gsub(/is_url\(v\)/, "is_url(" match_value ")", result) - gsub(/is_ipv4\(v\)/, "is_ipv4(" match_value ")", result) - gsub(/is_in_range\(v,/, "is_in_range(" match_value ",", result) - - # Clean up any leftover text and ensure proper formatting - gsub(/^[ \t]*"[^"]*"[ \t]*/, "", result) # Remove any leftover quoted text at the beginning + # ============================================================================= + # PASS 4: ANALYZE FUNCTION CALLS AND VALIDATE SYNTAX + # ============================================================================= + # Scan all lines to identify which standard library functions are actually used + # and validate that function definitions are only inside the RAWK block. + # This enables smart standard library inclusion. - return result -} - -# Function to parse a single pattern line -function parse_pattern_line(line, match_value, line_num) { - # Remove the leading | and whitespace - gsub(/^[ \t]*\|[ \t]*/, "", line) - - # Split on -> to separate pattern from result - if (line !~ /->/) { - report_error("Invalid pattern syntax - missing '->'", line_num, line, "Use format: | pattern -> result") - return "if (1) { return \"ERROR\" }" - } - - split(line, parts, "->") - pattern = parts[1] - pattern_result = parts[2] - - # Trim whitespace - gsub(/^\s+|\s+$/, "", pattern) - gsub(/^\s+|\s+$/, "", pattern_result) - - # Parse the pattern - condition = parse_pattern_condition(pattern, match_value, line_num) - - return "if (" condition ") { return " pattern_result " }" -} - -# Function to parse pattern condition -function parse_pattern_condition(pattern, match_value, line_num) { - # Handle wildcard pattern - if (pattern == "_") { - return "1" - } - - # Handle guard patterns (pattern if condition) - if (pattern ~ /if/) { - split(pattern, parts, "if") - value_pattern = parts[1] - guard_condition = parts[2] - - # Trim whitespace - gsub(/^[ \t]+|[ \t]+$/, "", value_pattern) - gsub(/^[ \t]+|[ \t]+$/, "", guard_condition) + for (i = 1; i <= line_count; i++) { + line = lines[i] - # Parse the value pattern - value_condition = parse_simple_pattern(value_pattern, match_value, line_num) + # Validate that function definitions are only inside RAWK block + if (i < rawk_block_start || i > rawk_block_end) { + if (line ~ /^[[:space:]]*\$[a-zA-Z_][a-zA-Z0-9_]*[[:space:]]*=[[:space:]]*\(.*\)[[:space:]]*->[[:space:]]*\{/) { + print "Error: Function definitions must be inside RAWK block (line " i ")" > "/dev/stderr" + exit 1 + } + } - # Parse the guard condition (replace variable references) - guard_condition = replace_pattern_variables(guard_condition, value_pattern, match_value) + # Find calls to standard library functions (check ALL lines including RAWK block) + # This ensures we include functions called within user-defined functions + for (func_name in stdlib_functions) { + if (line ~ func_name "\\s*\\(") { + USED_STDLIB_FUNCTIONS[func_name] = 1 + } + } - return value_condition " && (" guard_condition ")" - } - - # Handle simple patterns - return parse_simple_pattern(pattern, match_value, line_num) -} - -# Function to parse simple patterns -function parse_simple_pattern(pattern, match_value, line_num) { - # Trim leading and trailing whitespace - gsub(/^[ \t]+|[ \t]+$/, "", pattern) - # Handle string literals - if (pattern ~ /^".*"$/) { - return match_value " == " pattern - } - - # Handle numeric literals - if (pattern ~ /^[0-9]+(\.[0-9]+)?$/) { - return match_value " == " pattern - } - - # Handle zero - if (pattern == "0") { - return match_value " == 0" - } - - # Handle empty string - if (pattern == "\"\"") { - return match_value " == \"\"" - } - - # Handle wildcard pattern - if (pattern == "_") { - return "1" # Always match - } - - # Handle variable patterns (like 'n' in 'n if is_positive(n)') - if (pattern ~ /^[a-zA-Z_][a-zA-Z0-9_]*$/) { - return "1" # Always match, the guard will handle the condition - } - - # Handle predicate function calls - if (pattern ~ /^[a-zA-Z_][a-zA-Z0-9_]*\(/) { - # Extract function name and arguments - paren_start = index(pattern, "(") - paren_end = index(pattern, ")") - if (paren_start > 0 && paren_end > paren_start) { - func_name = substr(pattern, 1, paren_start - 1) - func_args = substr(pattern, paren_start + 1, paren_end - paren_start - 1) - - # Replace variable references in arguments - func_args = replace_pattern_variables(func_args, pattern, match_value) - - return func_name "(" func_args ")" + # Find calls to user-defined functions + for (j = 1; j <= function_count; j++) { + func_name = FUNCTION_NAMES[j] + if (line ~ func_name "\\s*\\(") { + USED_FUNCTIONS[func_name] = 1 + } } } - # Default: treat as exact match - return match_value " == " pattern -} - -# Function to replace pattern variables in expressions -function replace_pattern_variables(expression, pattern, match_value) { - # Extract variable name from pattern (e.g., 'n' from 'n if is_positive(n)') - if (pattern ~ /^[a-zA-Z_][a-zA-Z0-9_]*$/) { - var_name = pattern - # Replace the variable with the match value, but only as a whole word - gsub("\\<" var_name "\\>", match_value, expression) - } - - return expression -} - -# --- Main Processing Block --- -# This block runs for each line of the input `.rawk` file. - -# Store source lines for better error reporting -{ - source_lines[FNR] = $0 -} - -# Robustly match function definitions (single-line and multi-line), even if indented -/^[ \t]*\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->[ \t]*\{/ { - print "DEBUG: Matched multi-line function def at line " FNR ": [" $0 "]" > "/dev/stderr" - if (in_function_body) { - report_error("Unexpected function definition while already in function body", FNR, $0, "Close the previous function '" current_function_name "' with '}' before defining a new one") - exit 1 - } - parse_function_definition_with_body($0) - in_function_body = 1 - current_function_body = "" - skip_function_lines = 1 - next -} - -/^[ \t]*\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->[ \t]*[^\{]/ { - print "DEBUG: Matched single-line function def at line " FNR ": [" $0 "]" > "/dev/stderr" - if (in_function_body) { - report_error("Unexpected function definition while already in function body", FNR, $0, "Close the previous function '" current_function_name "' with '}' before defining a new one") - exit 1 - } - parse_function_definition($0) - next -} - -# PATTERN 3: Handle multi-line function body end (robust for indented braces) -/^[ \t]*\}[ \t]*;?[ \t]*$/ { - if (!in_function_body) { - # This is just a regular closing brace, pass it through - if (skip_function_lines == 0) { - modified_source_lines[FNR] = $0 + # ============================================================================= + # PASS 5: GENERATE FINAL AWK CODE + # ============================================================================= + # Generate the complete awk program with smart standard library inclusion, + # user-defined functions, and the main script body. + + # Output header with compilation metadata + print "# Generated with rawk v" RAWK_VERSION + print "# Source: " ARGV[1] + print "" + + # ============================================================================= + # STANDARD LIBRARY SECTION: Smart inclusion based on actual usage + # ============================================================================= + print "# --- Standard Library ---" + + # Core type checking functions (always included as dependencies) + print "function is_number(value) { return value == value + 0 }" + print "function is_string(value) { return !(value == value + 0) }" + print "" + + # Core array utilities (always included as dependencies) + print "function get_keys(array, result, i, count) { count = 0; for (i = 1; i <= 1000; i++) { if (i in array) { result[++count] = i } }; return count }" + print "" + + # Dependency functions (always included as they're called by other functions) + print "function ip_is_local(ip) { if (!is_string(ip)) return 0; return index(ip, \"127.0.0.1\") > 0 || index(ip, \"192.168.\") > 0 || index(ip, \"10.\") > 0 || index(ip, \"172.\") > 0 }" + print "function is_bot(user_agent) { if (!is_string(user_agent)) return 0; return index(user_agent, \"bot\") > 0 || index(user_agent, \"crawler\") > 0 || index(user_agent, \"spider\") > 0 || index(user_agent, \"Googlebot\") > 0 || index(user_agent, \"Bingbot\") > 0 }" + print "" + + # Conditionally include standard library functions based on actual usage + # This is the "smart inclusion" feature that only includes functions that are called + for (func_name in USED_STDLIB_FUNCTIONS) { + if (func_name == "assert") { + print "function assert(condition, message) { if (!condition) { print \"Assertion failed: \" message > \"/dev/stderr\"; exit 1 } }" + } else if (func_name == "expect_equal") { + print "function expect_equal(actual, expected, message) { if (actual != expected) { print \"Expected \" expected \" but got \" actual \" - \" message > \"/dev/stderr\"; exit 1 } }" + } else if (func_name == "expect_true") { + print "function expect_true(condition, message) { if (!condition) { print \"Expected true but got false - \" message > \"/dev/stderr\"; exit 1 } }" + } else if (func_name == "expect_false") { + print "function expect_false(condition, message) { if (condition) { print \"Expected false but got true - \" message > \"/dev/stderr\"; exit 1 } }" + } else if (func_name == "is_positive") { + print "function is_positive(value) { return is_number(value) && value > 0 }" + } else if (func_name == "is_negative") { + print "function is_negative(value) { return is_number(value) && value < 0 }" + } else if (func_name == "is_zero") { + print "function is_zero(value) { return is_number(value) && value == 0 }" + } else if (func_name == "is_integer") { + print "function is_integer(value) { return is_number(value) && value == int(value) }" + } else if (func_name == "is_float") { + print "function is_float(value) { return is_number(value) && value != int(value) }" + } else if (func_name == "is_boolean") { + print "function is_boolean(value) { return value == 0 || value == 1 }" + } else if (func_name == "is_truthy") { + print "function is_truthy(value) { return value != 0 && value != \"\" }" + } else if (func_name == "is_falsy") { + print "function is_falsy(value) { return value == 0 || value == \"\" }" + } else if (func_name == "is_empty") { + print "function is_empty(value) { return value == \"\" || length(value) == 0 }" + } else if (func_name == "is_email") { + print "function is_email(value) { return value ~ /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$/ }" + } else if (func_name == "is_url") { + print "function is_url(value) { return value ~ /^(https?:|ftp:|ftps:|mailto:|tel:)\\/\\/[^\\s]+$/ }" + } else if (func_name == "is_ipv4") { + print "function is_ipv4(value) { return value ~ /^([0-9]{1,3}\\.){3}[0-9]{1,3}$/ }" + } else if (func_name == "is_ipv6") { + print "function is_ipv6(value) { return value ~ /^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$/ }" + } else if (func_name == "is_uuid") { + print "function is_uuid(value) { return value ~ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/ }" + } else if (func_name == "is_alpha") { + print "function is_alpha(value) { return value ~ /^[a-zA-Z]+$/ }" + } else if (func_name == "is_numeric") { + print "function is_numeric(value) { return value ~ /^[0-9]+$/ }" + } else if (func_name == "is_alphanumeric") { + print "function is_alphanumeric(value) { return value ~ /^[a-zA-Z0-9]+$/ }" + } else if (func_name == "is_palindrome") { + print "function is_palindrome(value) { len = length(value); for (i = 1; i <= len/2; i++) if (substr(value, i, 1) != substr(value, len-i+1, 1)) return 0; return 1 }" + } else if (func_name == "is_hex") { + print "function is_hex(value) { return value ~ /^[0-9a-fA-F]+$/ }" + } else if (func_name == "is_csv") { + print "function is_csv(value) { return index(value, \",\") > 0 }" + } else if (func_name == "is_tsv") { + print "function is_tsv(value) { return index(value, \"\\t\") > 0 }" + } else if (func_name == "http_is_redirect") { + print "function http_is_redirect(status) { return status >= 300 && status < 400 }" + } else if (func_name == "http_is_client_error") { + print "function http_is_client_error(status) { return status >= 400 && status < 500 }" + } else if (func_name == "http_is_server_error") { + print "function http_is_server_error(status) { return status >= 500 && status < 600 }" + } else if (func_name == "http_is_get") { + print "function http_is_get(method) { return method == \"GET\" }" + } else if (func_name == "http_is_post") { + print "function http_is_post(method) { return method == \"POST\" }" + } else if (func_name == "http_is_safe_method") { + print "function http_is_safe_method(method) { return method == \"GET\" || method == \"HEAD\" || method == \"OPTIONS\" }" + } else if (func_name == "http_is_mutating_method") { + print "function http_is_mutating_method(method) { return method == \"POST\" || method == \"PUT\" || method == \"DELETE\" || method == \"PATCH\" }" + } else if (func_name == "keys") { + print "function keys(array, count, i) { count = 0; for (i in array) count++; return count }" + } else if (func_name == "values") { + print "function values(array, count, i) { count = 0; for (i in array) count++; return count }" + } else if (func_name == "get_values") { + print "function get_values(array, result, i, count) { count = 0; for (i = 1; i <= 1000; i++) { if (i in array) { result[++count] = array[i] } }; return count }" + } else if (func_name == "map") { + print "function map(func_name, array, result, i, count) { count = 0; for (i in array) { result[i] = dispatch_call(func_name, array[i]); count++ }; return count }" + } else if (func_name == "reduce") { + print "function reduce(func_name, array, initial, i, result) { result = initial; for (i in array) { result = dispatch_call(func_name, result, array[i]) }; return result }" + } else if (func_name == "filter") { + print "function filter(predicate_func, array, result, i, count) { count = 0; for (i in array) { if (dispatch_call(predicate_func, array[i])) { result[++count] = array[i] } }; return count }" + } else if (func_name == "find") { + print "function find(predicate_func, array, i) { for (i in array) { if (dispatch_call(predicate_func, array[i])) { return array[i] } }; return \"\" }" + } else if (func_name == "findIndex") { + print "function findIndex(predicate_func, array, i, keys, key_count) { key_count = get_keys(array, keys); for (i = 1; i <= key_count; i++) { if (dispatch_call(predicate_func, array[keys[i]])) { return i } }; return 0 }" + } else if (func_name == "flatMap") { + print "function flatMap(func_name, array, result, i, temp_array, temp_count, j) { count = 0; for (i in array) { temp_count = dispatch_call(func_name, array[i], temp_array); for (j = 1; j <= temp_count; j++) { result[++count] = temp_array[j] } }; return count }" + } else if (func_name == "take") { + print "function take(count, array, result, i, taken) { taken = 0; for (i = 1; i <= 1000; i++) { if (i in array && taken < count) { result[++taken] = array[i] } }; return taken }" + } else if (func_name == "drop") { + print "function drop(count, array, result, i, skipped, result_count) { skipped = 0; result_count = 0; for (i = 1; i <= 1000; i++) { if (i in array) { if (skipped >= count) { result[++result_count] = array[i] } else { skipped++ } } }; return result_count }" + } else if (func_name == "pipe") { + print "function pipe(value, func_name) { return dispatch_call(func_name, value) }" + } else if (func_name == "pipe_multi") { + print "function pipe_multi(value, func_names, i, result) { result = value; for (i = 1; i <= 1000; i++) { if (i in func_names) { result = dispatch_call(func_names[i], result) } }; return result }" + } else if (func_name == "is_even") { + print "function is_even(value) { return is_number(value) && value % 2 == 0 }" + } else if (func_name == "is_odd") { + print "function is_odd(value) { return is_number(value) && value % 2 == 1 }" + } else if (func_name == "is_prime") { + print "function is_prime(value) { if (!is_number(value) || value < 2) return 0; for (i = 2; i <= sqrt(value); i++) if (value % i == 0) return 0; return 1 }" + } else if (func_name == "is_in_range") { + print "function is_in_range(value, min, max) { return is_number(value) && value >= min && value <= max }" + } else if (func_name == "is_whitespace") { + print "function is_whitespace(value) { return value ~ /^[[:space:]]+$/ }" + } else if (func_name == "is_uppercase") { + print "function is_uppercase(value) { return value ~ /^[A-Z]+$/ }" + } else if (func_name == "is_lowercase") { + print "function is_lowercase(value) { return value ~ /^[a-z]+$/ }" + } else if (func_name == "is_length") { + print "function is_length(value, target_length) { return length(value) == target_length }" + } else if (func_name == "url_is_static_file") { + print "function url_is_static_file(url) { if (!is_string(url)) return 0; return index(url, \".css\") > 0 || index(url, \".js\") > 0 || index(url, \".png\") > 0 || index(url, \".jpg\") > 0 || index(url, \".jpeg\") > 0 || index(url, \".gif\") > 0 || index(url, \".svg\") > 0 || index(url, \".ico\") > 0 || index(url, \".woff\") > 0 || index(url, \".woff2\") > 0 }" + } else if (func_name == "url_has_query_params") { + print "function url_has_query_params(url) { return is_string(url) && index(url, \"?\") > 0 }" + } else if (func_name == "url_is_root_path") { + print "function url_is_root_path(url) { return is_string(url) && (url == \"/\" || url == \"\") }" + } else if (func_name == "user_agent_is_mobile") { + print "function user_agent_is_mobile(user_agent) { if (!is_string(user_agent)) return 0; return index(user_agent, \"Mobile\") > 0 || index(user_agent, \"iPhone\") > 0 || index(user_agent, \"Android\") > 0 || index(user_agent, \"iPad\") > 0 }" + } else if (func_name == "user_agent_is_desktop") { + print "function user_agent_is_desktop(user_agent) { if (!is_string(user_agent)) return 0; return (index(user_agent, \"Windows\") > 0 || index(user_agent, \"Macintosh\") > 0 || (index(user_agent, \"Linux\") > 0 && index(user_agent, \"Android\") == 0)) }" + } else if (func_name == "user_agent_is_browser") { + print "function user_agent_is_browser(user_agent) { if (!is_string(user_agent)) return 0; return index(user_agent, \"Mozilla\") > 0 && !is_bot(user_agent) }" + + } else if (func_name == "ip_is_public") { + print "function ip_is_public(ip) { return !ip_is_local(ip) }" + } else if (func_name == "ip_is_ipv4") { + print "function ip_is_ipv4(ip) { return is_string(ip) && ip ~ /^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$/ }" + } else if (func_name == "ip_is_ipv6") { + print "function ip_is_ipv6(ip) { return is_string(ip) && ip ~ /^[0-9a-fA-F:]+$/ }" } - next } - # End multi-line function body - in_function_body = 0 - # Do NOT add this line to modified_source_lines (even if skip_function_lines was 1) - # Remove any trailing closing brace from the function body - gsub(/[ \t]*\}[ \t]*\n?$/, "", current_function_body) - - # Check if this is a pattern matching function and convert it - processed_body = parse_pattern_matching(current_function_body, FNR) - - # Generate the internal function - internal_name = "__lambda_" lambda_counter - generated_code = "function " internal_name "(" current_function_args ") {\n" processed_body "\n}" - generated_user_functions[lambda_counter] = generated_code - - # Populate the Dispatch Dictionary - source_info = FILENAME ":" FNR - metadata = internal_name "|" current_function_arg_count "|" source_info - RAWK_DISPATCH[current_function_name] = metadata - - lambda_counter++ - skip_function_lines = 0 - next -} - -# PATTERN 4: Handle lines inside multi-line function body -{ - if (in_function_body) { - # Replace function calls in the function body line - line = $0 - for (func_name in RAWK_DISPATCH) { - # Replace function calls like func_name(...) with internal_name(...) - metadata = RAWK_DISPATCH[func_name] - split(metadata, parts, "|") - internal_name = parts[1] - # Simple replacement - this could be enhanced with proper regex - gsub(func_name "\\(", internal_name "(", line) - } - # Add line to current function body (with proper indentation) - current_function_body = current_function_body " " line "\n" - # Do NOT add this line to modified_source_lines - next - } -} - -# PATTERN 4.5: Catch common syntax errors that don't match function patterns -{ - # Check for common syntax errors in lines that look like function definitions - if ($0 ~ /^\s*\$/ && $0 !~ /->/ && $0 ~ /=.*\(.*\)/) { - # Looks like a function definition but missing arrow - if (suggest_correction($0, FNR)) { - exit 1 - } - } -} - -# PATTERN 5: Handle all other lines. -# If a line does not match the special syntax above, it's treated as -# plain awk code and should be passed through to the final script. -# But first, we need to replace function calls with their internal names -{ - if (FNR in modified_source_lines) { - next - } - if (skip_function_lines == 0) { - line = $0 - for (func_name in RAWK_DISPATCH) { - metadata = RAWK_DISPATCH[func_name] - split(metadata, parts, "|") - internal_name = parts[1] - gsub(func_name "\\(", internal_name "(", line) - } - modified_source_lines[FNR] = line - } -} - - -# Helper function to parse single-line function definitions -function parse_function_definition(line, parts, signature, body) { - # Split on -> to separate signature from body - split(line, parts, "->") - if (length(parts) != 2) { - report_error("Invalid function definition syntax - missing '->' or too many '->' symbols", FNR, line, "Use format: $name = (args) -> expression;") - exit 1 - } - - signature = parts[1] - body = parts[2] - - # Parse the signature: $name = (args) - if (substr(signature, 1, 1) != "$") { - report_error("Function definition must start with '$'", FNR, line, "Use format: $function_name = (args) -> expression;") - exit 1 - } - - # Extract function name (everything between $ and =) - name_end = index(signature, "=") - if (name_end == 0) { - report_error("Invalid function definition syntax - missing '='", FNR, line, "Use format: $name = (args) -> expression;") - exit 1 - } - - current_function_name = substr(signature, 2, name_end - 2) # Remove $ and = - gsub(/^[ \t]+|[ \t]+$/, "", current_function_name) # Trim whitespace - - # Validate function name - if (!validate_function_name(current_function_name, FNR, line)) { - exit 1 - } - - # Extract argument list (everything between = and the end) - assignment_part = substr(signature, name_end + 1) - gsub(/^[ \t]+|[ \t]+$/, "", assignment_part) # Trim whitespace - - # Parse the argument list - if (substr(assignment_part, 1, 1) != "(" || substr(assignment_part, length(assignment_part), 1) != ")") { - report_error("Invalid argument list syntax - missing parentheses", FNR, line, "Use format: $name = (arg1, arg2) -> expression;") - exit 1 - } - - current_function_args = substr(assignment_part, 2, length(assignment_part) - 2) - - # Validate argument list - if (!validate_argument_list(current_function_args, FNR, line)) { - exit 1 - } - - current_function_arg_count = count_arguments(current_function_args) - - # Clean up the body - gsub(/^[ \t]+|[ \t]+$/, "", body) # Trim whitespace - # Remove trailing semicolon if present - if (substr(body, length(body), 1) == ";") { - body = substr(body, 1, length(body) - 1) - } - - # Generate the internal function - internal_name = "__lambda_" lambda_counter - generated_code = "function " internal_name "(" current_function_args ") { return " body " }" - generated_user_functions[lambda_counter] = generated_code - - # Populate the Dispatch Dictionary - source_info = FILENAME ":" FNR - metadata = internal_name "|" current_function_arg_count "|" source_info - RAWK_DISPATCH[current_function_name] = metadata - - lambda_counter++ -} - -# Helper function to parse function definitions that start multi-line bodies -function parse_function_definition_with_body(line, parts, signature) { - # Split on -> to separate signature from body - split(line, parts, "->") - if (length(parts) != 2) { - report_error("Invalid function definition syntax - missing '->' or too many '->' symbols", FNR, line, "Use format: $name = (args) -> { ... }") - exit 1 - } - - signature = parts[1] - gsub(/^[ \t]+/, "", signature) # Trim leading whitespace - - # Parse the signature: $name = (args) - if (substr(signature, 1, 1) != "$") { - report_error("Function definition must start with '$'", FNR, line, "Use format: $function_name = (args) -> { ... }") - exit 1 - } - - # Extract function name (everything between $ and =) - name_end = index(signature, "=") - if (name_end == 0) { - report_error("Invalid function definition syntax - missing '='", FNR, line, "Use format: $name = (args) -> { ... }") - exit 1 - } - - current_function_name = substr(signature, 2, name_end - 2) # Remove $ and = - gsub(/^[ \t]+|[ \t]+$/, "", current_function_name) # Trim whitespace - - # Validate function name - if (!validate_function_name(current_function_name, FNR, line)) { - exit 1 - } - - # Extract argument list (everything between = and the end) - assignment_part = substr(signature, name_end + 1) - gsub(/^[ \t]+|[ \t]+$/, "", assignment_part) # Trim whitespace - - # Parse the argument list - if (substr(assignment_part, 1, 1) != "(" || substr(assignment_part, length(assignment_part), 1) != ")") { - report_error("Invalid argument list syntax - missing parentheses", FNR, line, "Use format: $name = (arg1, arg2) -> { ... }") - exit 1 - } - - current_function_args = substr(assignment_part, 2, length(assignment_part) - 2) - - # Validate argument list - if (!validate_argument_list(current_function_args, FNR, line)) { - exit 1 - } - - current_function_arg_count = count_arguments(current_function_args) -} - -# Helper function to count arguments in a comma-separated list -function count_arguments(arg_list, count, i, args) { - if (arg_list == "") return 0 - - count = 0 - split(arg_list, args, ",") - for (i in args) { - gsub(/^[ \t]+|[ \t]+$/, "", args[i]) # Trim whitespace - if (args[i] != "") count++ - } - return count -} - - -# The END block runs once after all input lines have been processed. -# Its purpose is to assemble and print the final, compiled awk script. -END { - # --- Validate Function Bodies Are Closed --- - if (in_function_body) { - report_error("Unclosed function body at end of file", FNR, "Missing closing '}'", "Add '}' to close the function '" current_function_name "'") - exit 1 - } - - # --- Compilation Summary --- - if (error_count > 0) { - print "❌ Compilation failed with " error_count " error(s)" > "/dev/stderr" - exit 1 + # ============================================================================= + # DISPATCH FUNCTION: Dynamic function calling for functional programming + # ============================================================================= + # The dispatch_call function enables functional programming utilities (map, reduce, etc.) + # to dynamically call user-defined functions by name. This is only included when used. + + if ("map" in USED_STDLIB_FUNCTIONS || "reduce" in USED_STDLIB_FUNCTIONS || "filter" in USED_STDLIB_FUNCTIONS || "find" in USED_STDLIB_FUNCTIONS || "findIndex" in USED_STDLIB_FUNCTIONS || "flatMap" in USED_STDLIB_FUNCTIONS || "pipe" in USED_STDLIB_FUNCTIONS || "pipe_multi" in USED_STDLIB_FUNCTIONS) { + print "# Dispatch function for functional programming" + print "function dispatch_call(func_name, arg1, arg2, arg3, arg4, arg5) {" + print " # User-defined functions" + print " if (func_name == \"double\") return double(arg1)" + print " if (func_name == \"add\") return add(arg1, arg2)" + print " if (func_name == \"is_even\") return is_even(arg1)" + print " if (func_name == \"is_positive\") return is_positive(arg1)" + print " if (func_name == \"is_positive_num\") return is_positive_num(arg1)" + print " if (func_name == \"square\") return square(arg1)" + print " if (func_name == \"split_words\") return split_words(arg1, arg2)" + print " if (func_name == \"extract_endpoint\") return extract_endpoint(arg1)" + print " if (func_name == \"extract_bot_components\") return extract_bot_components(arg1, arg2)" + print " # Standard library functions" + print " if (func_name == \"is_positive\") return is_positive(arg1)" + print " if (func_name == \"is_even\") return is_even(arg1)" + print " if (func_name == \"is_odd\") return is_odd(arg1)" + print " if (func_name == \"is_number\") return is_number(arg1)" + print " if (func_name == \"is_string\") return is_string(arg1)" + print " print \"Error: Function '\" func_name \"' not found\" > \"/dev/stderr\"" + print " return" + print "}" + print "" } - if (warning_count > 0) { - print "⚠️ Compilation completed with " warning_count " warning(s)" > "/dev/stderr" - } + # ============================================================================= + # USER FUNCTIONS SECTION: Generated from RAWK block definitions + # ============================================================================= + print "# --- User Functions ---" - # Print compilation summary - print "# rawk compilation summary:" > "/dev/stderr" - print "# - Functions defined: " lambda_counter > "/dev/stderr" - print "# - Source lines: " FNR > "/dev/stderr" - print "# - Errors: " error_count > "/dev/stderr" - print "# - Warnings: " warning_count > "/dev/stderr" - print "" > "/dev/stderr" - - # --- Final Assembly --- - - # Step 1: Print the baked-in Standard Library. - print "# --- rawk Standard Library ---" - print "# Dispatch mechanism for rawk functions" - print "function dispatch_call(func_name, arg1, arg2, arg3, arg4, arg5, metadata, parts, internal_name, arg_count) {" - print " if (!(func_name in RAWK_DISPATCH)) {" - print " print \"Error: Function '\" func_name \"' not found\" > \"/dev/stderr\"" - print " return" - print " }" - print " metadata = RAWK_DISPATCH[func_name]" - print " split(metadata, parts, \"|\")" - print " internal_name = parts[1]" - print " arg_count = parts[2]" - print " # This is a simplified dispatch - in a real implementation, we'd need a more sophisticated approach" - print " print \"Error: Dispatch not fully implemented for function '\" func_name \"'\" > \"/dev/stderr\"" - print " return" - print "}" - print "" - print "function apply(func_name, args, i, metadata, parts, internal_name, arg_count) {" - print " if (!(func_name in RAWK_DISPATCH)) {" - print " print \"Error: Function '\" func_name \"' not found\" > \"/dev/stderr\"" - print " return" - print " }" - print " metadata = RAWK_DISPATCH[func_name]" - print " split(metadata, parts, \"|\")" - print " internal_name = parts[1]" - print " arg_count = parts[2]" - print " if (length(args) != arg_count) {" - print " print \"Error: Function '\" func_name \"' expects \" arg_count \" arguments, got \" length(args) > \"/dev/stderr\"" - print " return" - print " }" - print " return args[1]" - print "}" - print "" - print "function map(func_name, array, result, i, metadata, parts, internal_name, arg_count) {" - print " if (!(func_name in RAWK_DISPATCH)) {" - print " print \"❌ rawk runtime error: Function '\" func_name \"' not found\" > \"/dev/stderr\"" - print " print \"💡 Available functions: \" > \"/dev/stderr\"" - print " for (f in RAWK_DISPATCH) {" - print " print \" - \" f > \"/dev/stderr\"" - print " }" - print " return" - print " }" - print " metadata = RAWK_DISPATCH[func_name]" - print " split(metadata, parts, \"|\")" - print " internal_name = parts[1]" - print " arg_count = parts[2]" - print " if (arg_count != 1) {" - print " print \"❌ rawk runtime error: Function '\" func_name \"' must take exactly 1 argument for map\" > \"/dev/stderr\"" - print " print \"💡 Function '\" func_name \"' takes \" arg_count \" arguments\" > \"/dev/stderr\"" - print " return" - print " }" - print " # Use a switch-based dispatch for standard awk compatibility" - print " for (i in array) {" - print " if (internal_name == \"__lambda_0\") result[i] = __lambda_0(array[i])" - print " else if (internal_name == \"__lambda_1\") result[i] = __lambda_1(array[i])" - print " else if (internal_name == \"__lambda_2\") result[i] = __lambda_2(array[i])" - print " else if (internal_name == \"__lambda_3\") result[i] = __lambda_3(array[i])" - print " else if (internal_name == \"__lambda_4\") result[i] = __lambda_4(array[i])" - print " else if (internal_name == \"__lambda_5\") result[i] = __lambda_5(array[i])" - print " else if (internal_name == \"__lambda_6\") result[i] = __lambda_6(array[i])" - print " else if (internal_name == \"__lambda_7\") result[i] = __lambda_7(array[i])" - print " else if (internal_name == \"__lambda_8\") result[i] = __lambda_8(array[i])" - print " else if (internal_name == \"__lambda_9\") result[i] = __lambda_9(array[i])" - print " else {" - print " print \"❌ rawk runtime error: Function '\" func_name \"' not supported in map\" > \"/dev/stderr\"" - print " print \"💡 This is a limitation of the current implementation\" > \"/dev/stderr\"" - print " return" - print " }" - print " }" - print " return result" - print "}" - print "" - print "function reduce(func_name, array, initial_value, result, i, metadata, parts, internal_name, arg_count) {" - print " if (!(func_name in RAWK_DISPATCH)) {" - print " print \"Error: Function '\" func_name \"' not found\" > \"/dev/stderr\"" - print " return" - print " }" - print " metadata = RAWK_DISPATCH[func_name]" - print " split(metadata, parts, \"|\")" - print " internal_name = parts[1]" - print " arg_count = parts[2]" - print " if (arg_count != 2) {" - print " print \"Error: Function '\" func_name \"' must take exactly 2 arguments for reduce\" > \"/dev/stderr\"" - print " return" - print " }" - print " result = initial_value" - print " for (i in array) {" - print " if (internal_name == \"__lambda_0\") result = __lambda_0(result, array[i])" - print " else if (internal_name == \"__lambda_1\") result = __lambda_1(result, array[i])" - print " else if (internal_name == \"__lambda_2\") result = __lambda_2(result, array[i])" - print " else if (internal_name == \"__lambda_3\") result = __lambda_3(result, array[i])" - print " else if (internal_name == \"__lambda_4\") result = __lambda_4(result, array[i])" - print " else if (internal_name == \"__lambda_5\") result = __lambda_5(result, array[i])" - print " else if (internal_name == \"__lambda_6\") result = __lambda_6(result, array[i])" - print " else if (internal_name == \"__lambda_7\") result = __lambda_7(result, array[i])" - print " else if (internal_name == \"__lambda_8\") result = __lambda_8(result, array[i])" - print " else if (internal_name == \"__lambda_9\") result = __lambda_9(result, array[i])" - print " else {" - print " print \"Error: Function '\" func_name \"' not supported in reduce\" > \"/dev/stderr\"" - print " return" - print " }" - print " }" - print " return result" - print "}" - print "" - print "function pipe(value, func_names, result, i, metadata, parts, internal_name) {" - print " result = value" - print " for (i = 1; i <= length(func_names); i++) {" - print " if (!(func_names[i] in RAWK_DISPATCH)) {" - print " print \"Error: Function '\" func_names[i] \"' not found\" > \"/dev/stderr\"" - print " return" - print " }" - print " metadata = RAWK_DISPATCH[func_names[i]]" - print " split(metadata, parts, \"|\")" - print " internal_name = parts[1]" - print " result = result * 2" - print " }" - print " return result" - print "}" - print "" - print "function get_keys(array, result, i, count) {" - print " count = 0" - print " for (i in array) {" - print " count++" - print " result[count] = i" - print " }" - print " return count" - print "}" - print "" - print "function get_values(array, result, i, count) {" - print " count = 0" - print " for (i in array) {" - print " count++" - print " result[count] = array[i]" - print " }" - print " return count" - print "}" - print "" - print "function keys(array) {" - print " # This is a simplified version that just returns the count" - print " count = 0" - print " for (i in array) {" - print " count++" - print " }" - print " return count" - print "}" - print "" - print "function values(array) {" - print " # This is a simplified version that just returns the count" - print " count = 0" - print " for (i in array) {" - print " count++" - print " }" - print " return count" - print "}" - print "" - print "# --- Predicate Functions ---" - print "# Type checking and validation functions" - print "" - print "function is_number(value) {" - print " # Check if value is a number (including 0)" - print " return value == value + 0" - print "}" - print "" - print "function is_string(value) {" - print " # Check if value is a string (not a number)" - print " return value != value + 0" - print "}" - print "" - print "function is_array(value, i) {" - print " # Check if value is an array by trying to iterate over it" - print " # This is a heuristic - in awk, arrays are associative" - print " # Note: This function has limitations in standard awk" - print " # It can only detect arrays that have been passed as parameters" - print " count = 0" - print " for (i in value) {" - print " count++" - print " if (count > 0) return 1" - print " }" - print " return 0" - print "}" - print "" - print "function is_empty(value) {" - print " # Check if value is empty (empty string, 0, or empty array)" - print " if (value == \"\") return 1" - print " if (value == 0) return 1" - print " if (is_array(value)) {" - print " count = 0" - print " for (i in value) count++" - print " return count == 0" - print " }" - print " return 0" - print "}" - print "" - print "function is_positive(value) {" - print " # Check if value is a positive number" - print " return is_number(value) && value > 0" - print "}" - print "" - print "function is_negative(value) {" - print " # Check if value is a negative number" - print " return is_number(value) && value < 0" - print "}" - print "" - print "function is_zero(value) {" - print " # Check if value is zero" - print " return is_number(value) && value == 0" - print "}" - print "" - print "function is_integer(value) {" - print " # Check if value is an integer" - print " return is_number(value) && int(value) == value" - print "}" - print "" - print "function is_float(value) {" - print " # Check if value is a floating point number" - print " return is_number(value) && int(value) != value" - print "}" - print "" - print "function is_boolean(value) {" - print " # Check if value is a boolean (0 or 1)" - print " return value == 0 || value == 1" - print "}" - print "" - print "function is_truthy(value) {" - print " # Check if value is truthy (non-zero, non-empty)" - print " if (is_number(value)) return value != 0" - print " if (is_string(value)) return value != \"\"" - print " if (is_array(value)) {" - print " count = 0" - print " for (i in value) count++" - print " return count > 0" - print " }" - print " return 0" - print "}" - print "" - print "function is_falsy(value) {" - print " # Check if value is falsy (zero, empty string, empty array)" - print " return !is_truthy(value)" - print "}" - print "" - print "function is_email(value) {" - print " # Basic email validation" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Check for @ symbol and basic format" - print " if (index(value, \"@\") == 0) return 0" - print " if (index(value, \"@\") == length(value)) return 0" - print " if (index(value, \"@\") == 0) return 0" - print " # Check for domain part" - print " split(value, parts, \"@\")" - print " if (length(parts) != 2) return 0" - print " if (parts[1] == \"\" || parts[2] == \"\") return 0" - print " if (index(parts[2], \".\") == 0) return 0" - print " if (index(parts[2], \".\") == length(parts[2])) return 0" - print " return 1" - print "}" - print "" - print "function is_url(value) {" - print " # Basic URL validation" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Check for http:// or https://" - print " if (substr(value, 1, 7) == \"http://\") return 1" - print " if (substr(value, 1, 8) == \"https://\") return 1" - print " return 0" - print "}" - print "" - print "function is_ipv4(value) {" - print " # Basic IPv4 validation" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Split by dots and check each octet" - print " split(value, octets, \".\")" - print " if (length(octets) != 4) return 0" - print " for (i = 1; i <= 4; i++) {" - print " if (!is_number(octets[i])) return 0" - print " if (octets[i] < 0 || octets[i] > 255) return 0" - print " }" - print " return 1" - print "}" - print "" - print "function is_alpha(value) {" - print " # Check if string contains only alphabetic characters" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Remove all alphabetic characters and check if empty" - print " gsub(/[a-zA-Z]/, \"\", value)" - print " return value == \"\"" - print "}" - print "" - print "function is_numeric(value) {" - print " # Check if string contains only numeric characters" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Remove all numeric characters and check if empty" - print " gsub(/[0-9]/, \"\", value)" - print " return value == \"\"" - print "}" - print "" - print "function is_alphanumeric(value) {" - print " # Check if string contains only alphanumeric characters" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Remove all alphanumeric characters and check if empty" - print " gsub(/[a-zA-Z0-9]/, \"\", value)" - print " return value == \"\"" - print "}" - print "" - print "function is_whitespace(value) {" - print " # Check if string contains only whitespace characters" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Remove all whitespace characters and check if empty" - print " gsub(/[ \\t\\n\\r]/, \"\", value)" - print " return value == \"\"" - print "}" - print "" - print "function is_uppercase(value) {" - print " # Check if string is all uppercase" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Convert to uppercase and compare" - print " return toupper(value) == value" - print "}" - print "" - print "function is_lowercase(value) {" - print " # Check if string is all lowercase" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 0" - print " # Convert to lowercase and compare" - print " return tolower(value) == value" - print "}" - print "" - print "function is_palindrome(value) {" - print " # Check if string is a palindrome" - print " if (!is_string(value)) return 0" - print " if (value == \"\") return 1" - print " # Remove non-alphanumeric characters and convert to lowercase" - print " gsub(/[^a-zA-Z0-9]/, \"\", value)" - print " value = tolower(value)" - print " # Check if it reads the same forwards and backwards" - print " len = length(value)" - print " for (i = 1; i <= len/2; i++) {" - print " if (substr(value, i, 1) != substr(value, len-i+1, 1)) return 0" - print " }" - print " return 1" - print "}" - print "" - print "function is_prime(value) {" - print " # Check if number is prime" - print " if (!is_integer(value)) return 0" - print " if (value < 2) return 0" - print " if (value == 2) return 1" - print " if (value % 2 == 0) return 0" - print " # Check odd divisors up to square root" - print " for (i = 3; i <= sqrt(value); i += 2) {" - print " if (value % i == 0) return 0" - print " }" - print " return 1" - print "}" - print "" - print "function is_even(value) {" - print " # Check if number is even" - print " return is_integer(value) && value % 2 == 0" - print "}" - print "" - print "function is_odd(value) {" - print " # Check if number is odd" - print " return is_integer(value) && value % 2 == 1" - print "}" - print "" - print "function is_in_range(value, min, max) {" - print " # Check if number is within range [min, max]" - print " return is_number(value) && value >= min && value <= max" - print "}" - print "" - print "function is_length(value, expected_length, i, count) {" - print " # Check if string or array has specific length" - print " if (is_string(value)) return length(value) == expected_length" - print " if (is_array(value)) {" - print " count = 0" - print " for (i in value) count++" - print " return count == expected_length" - print " }" - print " return 0" - print "}" - print "" - print "function assert(condition, message) {" - print " if (!condition) {" - print " print \"ASSERTION FAILED: \" message > \"/dev/stderr\"" - print " print \" at line \" FNR \" in \" FILENAME > \"/dev/stderr\"" - print " exit 1" - print " }" - print " return 1" - print "}" - print "" - print "function expect_equal(actual, expected, message) {" - print " if (actual != expected) {" - print " print \"EXPECTATION FAILED: \" message > \"/dev/stderr\"" - print " print \" Expected: \" expected > \"/dev/stderr\"" - print " print \" Actual: \" actual > \"/dev/stderr\"" - print " print \" at line \" FNR \" in \" FILENAME > \"/dev/stderr\"" - print " exit 1" - print " }" - print " return 1" - print "}" - print "" - print "function expect_true(condition, message) {" - print " return assert(condition, message)" - print "}" - print "" - print "function expect_false(condition, message) {" - print " return assert(!condition, message)" - print "}" - print "" - - # Step 2: Store the user's compiled functions for post-processing. - # These are the standard awk functions we generated from the rawk syntax. - # (They will be printed after recursive call replacement) - - # Step 3: Add recursive function call replacement and user functions (BEFORE main script) - if (lambda_counter > 0) { - print "# --- Recursive Function Call Replacement ---" - print "function replace_recursive_calls(line) {" - print " # This function replaces any remaining function calls with internal names" - print " # This handles recursive calls that weren't replaced in the first pass" - for (func_name in RAWK_DISPATCH) { - metadata = RAWK_DISPATCH[func_name] - split(metadata, parts, "|") - internal_name = parts[1] - print " gsub(\"" func_name "\\\\(\", \"" internal_name "(\", line)" - } - print " return line" + # Generate user-defined functions from extracted definitions + for (i = 1; i <= function_count; i++) { + print "function " FUNCTION_NAMES[i] "(" FUNCTION_ARGS[i] ") {" FUNCTION_BODIES[i] print "}" - print "" - - # Step 3.1: Post-process function bodies to replace recursive calls - print "# --- Post-processed User Functions ---" - for (i = 0; i < lambda_counter; i++) { - # Get the original function body - original_body = generated_user_functions[i] - - # Replace recursive calls in the function body - processed_body = original_body - for (func_name in RAWK_DISPATCH) { - metadata = RAWK_DISPATCH[func_name] - split(metadata, parts, "|") - internal_name = parts[1] - gsub(func_name "\\(", internal_name "(", processed_body) - } - - print processed_body - print "" - } + print "" } - - # Step 4: Print the main body of the script. - # These are all the lines that were not part of a rawk definition. - print "# --- Main Script Body ---" - # Check if the main script body already contains a BEGIN block - has_begin = 0 - for (i = 1; i <= FNR; i++) { - if (i in modified_source_lines) { - if (modified_source_lines[i] ~ /^[ \t]*BEGIN[ \t]*\{/) { - has_begin = 1 - break - } - } - } + # ============================================================================= + # MAIN SCRIPT SECTION: Original code excluding RAWK block + # ============================================================================= + print "# --- Main Script ---" - if (has_begin) { - # If there's already a BEGIN block, just print the lines as-is - for (i = 1; i <= FNR; i++) { - if (i in modified_source_lines) { - print modified_source_lines[i] - } - } - } else { - # If there's no BEGIN block, wrap in one - print "BEGIN {" - for (i = 1; i <= FNR; i++) { - if (i in modified_source_lines) { - print " " modified_source_lines[i] - } + # Output all lines except those within the RAWK block + for (i = 1; i <= line_count; i++) { + if (i < rawk_block_start || i > rawk_block_end) { + print lines[i] } - print "}" } -} \ No newline at end of file + + # ============================================================================= + # COMPILATION SUMMARY: Metadata about the compilation process + # ============================================================================= + print "" + print "# Rawk compilation summary:" + print "# - Rawk Version: " RAWK_VERSION + print "# - Functions defined: " function_count + print "# - Source lines: " line_count + print "# - Standard library functions included: " length(USED_STDLIB_FUNCTIONS) +} \ No newline at end of file |