#!/usr/bin/env awk -f # rawk.awk # Author: @eli_oat # License: Public Domain # Version: RAWK_VERSION = "0.0.1" # Let's help awk rawk # # This script translates a `.rawk` source file into standard, portable awk code. # It uses a two-stage compilation approach for robustness and simplicity. # # This script is implemented in awk, and should work with any POSIX awk. # # USAGE: # # Two-stage compilation (recommended) # awk -f rawk.awk my_program.rawk > my_program.awk # awk -f my_program.awk # # # One-step compilation and execution # awk -f rawk.awk my_program.rawk | awk -f - # # EXAMPLES: # # Basic usage - compile and run # awk -f rawk.awk hello.rawk | awk -f - # # # Compile rawk to an awk file for later use # awk -f rawk.awk hello.rawk > hello.awk # awk -f hello.awk # # # Process input data # awk -f rawk.awk processor.rawk | awk -f - input.txt # # COMPILATION PROCESS: # 1. Parse rawk syntax and validate # 2. Generate standard AWK code # 3. Output generated code to stdout # 4. Output errors/warnings to stderr # 5. Exit with appropriate code (0=success, 1=error) # # ----------------------------------------------------------------------------- # LANGUAGE FEATURES # ----------------------------------------------------------------------------- # 1. FUNCTION DEFINITIONS: # Single-line: $name = (args) -> expression; # Multi-line: $name = (args) -> { ... 
}; # # SYNTAX RULES: # - Each function definition must be on its own line # - No code allowed after function definitions on the same line # - Single-line functions must end with semicolon # - Multi-line functions must not end with semicolon # # Examples: # $add = (x, y) -> x + y; # $greet = (name) -> "Hello, " name; # $calculate = (width, height) -> { # area = width * height # return area # }; # # ❌ Invalid (multiple functions on one line): # $add = (x, y) -> x + y; $multiply = (a, b) -> a * b; # # ❌ Invalid (code after function): # $add = (x, y) -> x + y; print "hello"; # # ❌ Invalid (missing semicolon): # $add = (x, y) -> x + y # # ❌ Invalid (extra semicolon): # $calculate = (w, h) -> { return w * h }; # # 2. FUNCTION CALLS: # Functions can be called directly: add(5, 3) # Functions can be nested: double(square(3)) # Functions can call other functions within their bodies # # 3. STANDARD LIBRARY: # # ARRAY UTILITIES: # - keys(array): Returns count of keys in array # - values(array): Returns count of values in array # - get_keys(array, result): Populates result array with keys # - get_values(array, result): Populates result array with values # # FUNCTIONAL PROGRAMMING: # - map(func_name, array, result): Apply function to each element of array # - reduce(func_name, array, initial): Reduce array using function (left fold) # - pipe(value, func_name): Pipe value through a single function # - pipe_multi(value, func_names): Pipe value through multiple functions # - dispatch_call(func_name, arg1, arg2, ...): Dynamic function dispatch # # ENHANCED ARRAY UTILITIES: # - filter(predicate_func, array, result): Filter array elements based on predicate # - find(predicate_func, array): Find first element that matches predicate # - findIndex(predicate_func, array): Find index of first element that matches predicate # - flatMap(func_name, array, result): Apply function to each element and flatten result # - take(count, array, result): Take first n elements from array # - drop(count, 
array, result): Drop first n elements from array # # TESTING FUNCTIONS: # - assert(condition, message): Asserts a condition is true # - expect_equal(actual, expected, message): Asserts actual equals expected # - expect_true(condition, message): Asserts condition is true # - expect_false(condition, message): Asserts condition is false # # PREDICATE FUNCTIONS: # - is_number(value), is_string(value), is_array(value) # - is_positive(value), is_negative(value), is_zero(value) # - is_integer(value), is_float(value), is_boolean(value) # - is_even(value), is_odd(value), is_prime(value) # - is_whitespace(value), is_uppercase(value), is_lowercase(value) # - is_email(value), is_url(value), is_ipv4(value), is_ipv6(value) # - is_uuid(value), is_hex(value), is_csv(value), is_tsv(value) # - is_palindrome(value), is_length(value, target_length) # - http_is_redirect(status), http_is_client_error(status), http_is_server_error(status) # - http_is_get(method), http_is_post(method), http_is_safe_method(method), http_is_mutating_method(method) # - url_is_static_file(url), url_has_query_params(url), url_is_root_path(url) # - user_agent_is_mobile(user_agent), user_agent_is_desktop(user_agent), user_agent_is_browser(user_agent) # - ip_is_local(ip), ip_is_public(ip), ip_is_ipv4(ip), ip_is_ipv6(ip) # # 4. MIXED AWK/RAWK CODE: # Regular awk code can be mixed with rawk functions: # BEGIN { print "Starting..." } # $process = (line) -> "Processed: " line; # { print process($0) } # END { print "Done." } # # ----------------------------------------------------------------------------- # ARCHITECTURE AND TECHNICAL MISCELLANY # ----------------------------------------------------------------------------- # 1. Parse: Extract rawk function definitions using `->` symbol # 2. Generate: Create internal awk functions with unique names (`__lambda_0`, etc.) # 3. Dispatch: Build dispatch table mapping public names to internal names # 4. Replace: Replace function calls with internal names in source code # 5. 
Output: Generate final awk script with standard library and user code
#
# GENERATED CODE STRUCTURE:
# - Standard library functions (predicates, utilities, testing)
# - Dispatch table (BEGIN block with RAWK_DISPATCH array)
# - Internal function definitions (__lambda_0, __lambda_1, etc.)
# - Main script body (user code with function calls replaced)
#
# LIMITATIONS:
# - Function names must be valid awk identifiers
# - Array returns from functions are not supported (use pass-by-reference)
# - Array iteration order is not guaranteed (AWK limitation)
# - Dynamic dispatch limited to functions defined at compile time
# - Maximum 5 arguments per function (dispatch table limitation)
#
# ERROR HANDLING:
# - Invalid syntax generates descriptive error messages with context
# - Missing functions are reported at runtime with helpful suggestions
# - Argument count mismatches are detected with detailed information
# - Source line correlation for better debugging
#
# PORTABILITY:
# - Output is compatible with standard awk (nawk, BSD awk)
# - Avoids gawk-specific features
# - Uses only standard awk constructs and functions
#
# -----------------------------------------------------------------------------

# Global state for multi-pass compilation.
# Every variable set here is a global that the record rule below and the
# parse_* helpers read or update.
BEGIN {
    # --- Compiler State Initialization ---

    # Function collection arrays.
    # split("", A) is the portable way to create/clear an array; the
    # whole-array form "delete A" is not accepted by every awk.
    split("", FUNCTION_NAMES)
    split("", FUNCTION_ARGS)
    split("", FUNCTION_BODIES)
    split("", FUNCTION_TYPES)     # "single" or "multi"
    split("", FUNCTION_LINES)     # source line numbers

    # Counters
    function_count = 0
    line_count = 0

    # State tracking for multi-line function definitions
    in_function_body = 0          # 1 while body lines are being collected
    brace_count = 0               # unclosed "{" inside the current function
    in_function_def = 0           # Track if we're in a function definition context
    current_function_index = 0    # slot in FUNCTION_* that body lines go to

    # Source lines for pass 2
    split("", SOURCE_LINES)
    split("", SOURCE_LINE_TYPES)  # "function_def", "function_body", "code"

    # Lines that are not part of any rawk function definition
    split("", main_script_lines)
    main_script_count = 0

    # Enhanced error tracking
    error_count = 0
    warning_count = 0

    # Compilation statistics
    functions_defined = 0
    source_lines = 0
    errors = 0
    warnings = 0

    # Syntax validation state
    validation_mode = 0           # 0 = normal compilation, 1 = syntax validation only
}

# -----------------------------------------------------------------------------
# MAIN PROCESSING: Parse and collect function definitions
# -----------------------------------------------------------------------------

# Runs once per input line and sorts it into one of three buckets:
#   - a rawk function header  -> handed to parse_multi_line_function()
#   - a line of a function body -> appended to FUNCTION_BODIES[...]
#   - anything else           -> appended to main_script_lines[]
# Brace counting is purely textual: braces inside strings, regexes or trailing
# comments are counted like any other brace.
{
    line_count++

    # Skip comments and empty lines (this also applies inside function bodies)
    if ($0 ~ /^[ \t]*#/ || $0 ~ /^[ \t]*$/) {
        next
    }

    # Pattern: Multi-line function definition start (the only allowed form).
    # parse_multi_line_function() records the header and switches on
    # in_function_body so the following lines are collected as the body.
    if ($0 ~ /^[ \t]*\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->[ \t]*\{/) {
        in_function_def = 1
        parse_multi_line_function($0, line_count)
        next  # Do not add function definition line to main_script_lines
    }

    # Validate: Only allow function definitions with { ... }
    if ($0 ~ /^[ \t]*\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->[ \t]*[^\{]/) {
        report_validation_error("Function definitions must use braces: -> { ... }", line_count, $0, "Use: $name = (args) -> { ... }")
        next
    }

    # Pattern: Multi-line function body continuation
    if (in_function_body) {
        # Count braces on a scratch copy; gsub() is only used as a counter.
        brace_scan = $0
        open_braces = gsub(/\{/, "{", brace_scan)
        close_braces = gsub(/\}/, "}", brace_scan)

        # Net depth after this line. Deciding on the net depth (rather than
        # "the line contains a closing brace") keeps a balanced one-liner such
        # as `if (x) { y = 1 }` from ending the function early.
        brace_count += open_braces - close_braces

        if (brace_count <= 0) {
            # The brace that opened the function has been closed. Keep any
            # code that sits in front of that final brace on the same line.
            closing_code = $0
            sub(/\}[^}]*$/, "", closing_code)
            if (closing_code !~ /^[ \t]*$/) {
                FUNCTION_BODIES[current_function_index] = FUNCTION_BODIES[current_function_index] "\n " closing_code
            }
            in_function_body = 0
            in_function_def = 0
            brace_count = 0
            next
        }

        # Still inside the function: add line to current function body
        FUNCTION_BODIES[current_function_index] = FUNCTION_BODIES[current_function_index] "\n " $0
        next
    }

    # Pattern: Start of multi-line function body, but only if not already in a function body
    if (!in_function_body && in_function_def && $0 ~ /^[ \t]*\{/) {
        in_function_body = 1
        brace_count = 1
        next
    }

    # Pattern: Regular code - collect for main script.
    # Lines that look like a rawk definition ("$..." containing "->") but
    # matched neither header pattern above are left out of the main script.
    if (!in_function_body && !($0 ~ /^[ \t]*\$/ && $0 ~ /->/)) {
        main_script_lines[++main_script_count] = $0
    }

    # Nothing else to do for this record; stop before any later rule sees it.
    next
}

# -----------------------------------------------------------------------------
# HELPER
FUNCTIONS
# -----------------------------------------------------------------------------

# First-pass syntax validation for a single source line.
#
# Parameters:
#   line     - text of the line to check
#   line_num - its line number, passed through to the reporters
#   probe    - local scratch variable; callers must not pass it
#
# The checks run in order and the function returns after the first one that
# fires, so at most one diagnostic is reported per line. Nothing is returned;
# findings go to report_validation_error() / report_validation_warning().
function validate_line_syntax(line, line_num,    probe) {
    # Check for multiple functions on one line.
    # gsub() is used only to count matches, so it runs on a scratch copy of
    # `line`. Without a third argument gsub() works on $0, which would both
    # count the wrong text and overwrite the current input record.
    probe = line
    if (gsub(/\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->/, "FUNC", probe) > 1) {
        report_validation_error("Multiple function definitions on one line", line_num, line, "Put each function on its own line")
        return
    }

    # Check for code after function definition on the same line
    if (line ~ /^\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->[ \t]*[^;{]*;[ \t]*[^ \t]/) {
        report_validation_error("Code after function definition on same line", line_num, line, "Put function definition on its own line")
        return
    }

    # Check for single-line functions missing semicolons
    if (line ~ /^\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->[ \t]*[^;{]*$/) {
        report_validation_error("Single-line function definition missing semicolon", line_num, line, "Add semicolon: " line ";")
        return
    }

    # Check for invalid function names.
    # The hint strings below use a plain "$": the escape "\$" is not defined
    # for awk string literals and is rendered differently by different awks.
    if (line ~ /^\$[0-9]/) {
        report_validation_error("Function name cannot start with a number", line_num, line, "Use a letter or underscore: $func_name = ...")
        return
    }

    # Check for missing arrow operator
    if (line ~ /^\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*[^-]/ && line !~ /->/) {
        report_validation_error("Function definition missing arrow operator (->)", line_num, line, "Add arrow: $func = (args) -> expression")
        return
    }

    # Check for multi-line functions with semicolon after closing brace
    if (line ~ /^\$[a-zA-Z_][a-zA-Z0-9_]*[ \t]*=[ \t]*\([^)]*\)[ \t]*->[ \t]*\{[ \t]*\}[ \t]*;[ \t]*$/) {
        report_validation_error("Multi-line function should not end with semicolon", line_num, line, "Remove semicolon after closing brace")
        return
    }

    # Check for standard AWK function syntax
    if (line ~ /^function[ \t]+[a-zA-Z_][a-zA-Z0-9_]*[ \t]*\(/) {
        report_validation_warning("Standard AWK function syntax detected", line_num, line, "Use rawk syntax: $func = (args) -> ...")
        return
    }
}

# Parse the header line of a braced function definition:
#     $name = (args) -> { ...
#
# Parameters:
#   line       - the definition line
#   line_num   - its line number (stored in FUNCTION_LINES)
#   rest, depth - local scratch variables; callers must not pass them
#
# Effects (all on globals): appends one entry to FUNCTION_NAMES/ARGS/BODIES/
# TYPES/LINES, bumps function_count and functions_defined, points
# current_function_index at the new entry, and sets in_function_body /
# brace_count so the caller knows whether body lines follow.
# func_name and args are deliberately left as globals, as in the original;
# code outside this view may read them - TODO confirm before making them local.
function parse_multi_line_function(line, line_num,    rest, depth) {
    # Extract function name (the identifier right after "$")
    if (match(line, /\$([a-zA-Z_][a-zA-Z0-9_]*)/)) {
        func_name = substr(line, RSTART + 1, RLENGTH - 1)
    } else {
        report_error("Invalid function name", line_num, line, "Function name must be a valid identifier")
        return
    }

    # Extract arguments (text between the first pair of parentheses)
    if (match(line, /\(([^)]*)\)/)) {
        args = substr(line, RSTART + 1, RLENGTH - 2)
    } else {
        report_error("Invalid argument list", line_num, line, "Arguments must be enclosed in parentheses")
        return
    }

    # Look at whatever follows the opening brace on this line, so that code
    # placed there is kept and the brace depth reflects what is really open.
    rest = ""
    depth = 1  # the opening brace itself
    if (match(line, /->[ \t]*\{/)) {
        rest = substr(line, RSTART + RLENGTH)
        depth += gsub(/\{/, "{", rest)
        depth -= gsub(/\}/, "}", rest)
    }

    # Store function information
    function_count++
    current_function_index = function_count
    FUNCTION_NAMES[function_count] = func_name
    FUNCTION_ARGS[function_count] = args
    FUNCTION_BODIES[function_count] = ""
    FUNCTION_TYPES[function_count] = "multi"
    FUNCTION_LINES[function_count] = line_num
    functions_defined++

    if (depth <= 0) {
        # The body was opened and closed on the definition line, e.g.
        #     $f = (x) -> { return x }
        # Drop the closing brace (and anything after it); nothing follows.
        sub(/\}[^}]*$/, "", rest)
        gsub(/^[ \t]+|[ \t]+$/, "", rest)
        if (rest != "") {
            FUNCTION_BODIES[function_count] = "\n " rest
        }
        in_function_body = 0
        in_function_def = 0
        brace_count = 0
        return
    }

    # Body continues on the following lines; keep any code that already
    # appeared after the opening brace.
    gsub(/^[ \t]+|[ \t]+$/, "", rest)
    if (rest != "") {
        FUNCTION_BODIES[function_count] = "\n " rest
    }
    in_function_body = 1
    brace_count = depth
}

# Parse a single-line function definition:
#     $name = (args) -> expression;
#
# Parameters:
#   line       - the definition line
#   line_num   - its line number (stored in FUNCTION_LINES)
#   arrow, rest - local scratch variables; callers must not pass them
#
# Effects (all on globals): appends one entry of type "single" to the
# FUNCTION_* arrays and bumps function_count and functions_defined.
# func_name, args and body are left as globals, as in the original.
function parse_single_line_function(line, line_num,    arrow, rest) {
    # Extract function name (the identifier right after "$")
    if (match(line, /\$([a-zA-Z_][a-zA-Z0-9_]*)/)) {
        func_name = substr(line, RSTART + 1, RLENGTH - 1)
    } else {
        report_error("Invalid function name", line_num, line, "Function name must be a valid identifier")
        return
    }

    # Extract arguments (text between the first pair of parentheses)
    if (match(line, /\(([^)]*)\)/)) {
        args = substr(line, RSTART + 1, RLENGTH - 2)
    } else {
        report_error("Invalid argument list", line_num, line, "Arguments must be enclosed in parentheses")
        return
    }

    # Extract body: everything between the first "->" and the LAST semicolon
    # on the line. This is spelled out with index()/match() because awk's
    # regular expressions have no non-greedy quantifier; ".+?" is not one.
    arrow = index(line, "->")
    rest = (arrow > 0) ? substr(line, arrow + 2) : ""
    if (arrow > 0 && match(rest, /;[^;]*$/)) {
        body = substr(rest, 1, RSTART - 1)
        # Trim whitespace
        gsub(/^[ \t]+|[ \t]+$/, "", body)
    } else {
        report_error("Invalid function body", line_num, line, "Function body must follow '->' and end with ';'")
        return
    }

    # Store function information
    function_count++
    FUNCTION_NAMES[function_count] = func_name
    FUNCTION_ARGS[function_count] = args
    FUNCTION_BODIES[function_count] = body
    FUNCTION_TYPES[function_count] = "single"
    FUNCTION_LINES[function_count] = line_num
    functions_defined++
}

# Generate standard library functions
# FIXME: in the future, we should only generate the functions that are actually used
# TODO: track which functions are used/referenced
function generate_standard_library() {
    print "# --- rawk Standard Library ---"
    print "# Dispatch mechanism for rawk functions"
    print "function dispatch_call(func_name, arg1, arg2, arg3, arg4, arg5, metadata, parts, internal_name, arg_count) {"
    print " if (!(func_name in RAWK_DISPATCH)) {"
    print " print \"Error: Function '\" func_name \"' not found\" > \"/dev/stderr\""
    print " return"
    print " }"
    print " metadata = RAWK_DISPATCH[func_name]"
    print " split(metadata, parts, \"|\")"
    print " internal_name = parts[1]"
    print " arg_count = parts[2]"
    print " "
    print " # Switch statement dispatch based on internal function name"
    for (i = 1; i <= function_count; i++) {
        internal_name = "__lambda_" (i - 1)
        arg_count = split(FUNCTION_ARGS[i], args_array, ",")
        print " if (internal_name == \"" internal_name "\") {"
        if (arg_count == 0) {
            print " if (arg_count == 0) return " internal_name "()"
        } else if (arg_count == 1) {
            print " if (arg_count == 1) return " internal_name "(arg1)"
        } else if (arg_count == 2) {
            print " if (arg_count == 2) return " internal_name "(arg1, arg2)"
        } else if (arg_count == 3) {
            print " if (arg_count == 3) return " internal_name "(arg1, arg2, arg3)"
        } else if (arg_count == 4) {
            print " if (arg_count
== 4) return " internal_name "(arg1, arg2, arg3, arg4)" } else if (arg_count == 5) { print " if (arg_count == 5) return " internal_name "(arg1, arg2, arg3, arg4, arg5)" } else { print " print \"Error: Function '\" func_name \"' has too many arguments (\" arg_count \")\" > \"/dev/stderr\"" print " return" } print " }" } print " " print " print \"Error: Invalid argument count for function '\" func_name \"'\" > \"/dev/stderr\"" print " return" print "}" print "" print "# --- Predicate Functions ---" print "# Type checking and validation functions" print "" print "function is_number(value) {" print " # Check if value is a number (including 0)" print " return value == value + 0" print "}" print "" print "function is_string(value) {" print " # Check if value is a string (not a number)" print " # In AWK, string numbers like \"123\" are both strings and numbers" print " # So we check if it's NOT a number to determine if it's a pure string" print " return !(value == value + 0)" print "}" print "" print "function assert(condition, message) {" print " if (!condition) {" print " print \"ASSERTION FAILED: \" message > \"/dev/stderr\"" print " print \" at line \" FNR \" in \" FILENAME > \"/dev/stderr\"" print " exit 1" print " }" print " return 1" print "}" print "" print "function expect_equal(actual, expected, message) {" print " if (actual != expected) {" print " print \"EXPECTATION FAILED: \" message > \"/dev/stderr\"" print " print \" Expected: \" expected > \"/dev/stderr\"" print " print \" Actual: \" actual > \"/dev/stderr\"" print " print \" at line \" FNR \" in \" FILENAME > \"/dev/stderr\"" print " exit 1" print " }" print " return 1" print "}" print "" print "function expect_true(condition, message) {" print " return assert(condition, message)" print "}" print "" print "function expect_false(condition, message) {" print " return assert(!condition, message)" print "}" print "" print "function is_positive(value) {" print " # Check if value is a positive number" print " 
return is_number(value) && value > 0" print "}" print "" print "function is_negative(value) {" print " # Check if value is a negative number" print " return is_number(value) && value < 0" print "}" print "" print "function is_zero(value) {" print " # Check if value is zero" print " return is_number(value) && value == 0" print "}" print "" print "function is_integer(value) {" print " # Check if value is an integer" print " return is_number(value) && int(value) == value" print "}" print "" print "function is_float(value) {" print " # Check if value is a floating point number" print " return is_number(value) && int(value) != value" print "}" print "" print "function is_boolean(value) {" print " # Check if value is a boolean (0 or 1)" print " return value == 0 || value == 1" print "}" print "" print "function is_truthy(value) {" print " # Check if value is truthy (non-zero, non-empty)" print " if (is_number(value)) return value != 0" print " if (is_string(value)) return value != \"\"" print " return 0" print "}" print "" print "function is_falsy(value) {" print " # Check if value is falsy (zero, empty string)" print " return !is_truthy(value)" print "}" print "" print "function is_empty(value) {" print " # Check if value is empty (empty string, 0)" print " if (value == \"\") return 1" print " if (value == 0) return 1" print " return 0" print "}" print "" print "function is_email(value) {" print " # Simple email validation" print " if (value == \"\") return 0" print " # Must contain exactly one @ symbol" print " at_count = 0" print " for (i = 1; i <= length(value); i++) {" print " if (substr(value, i, 1) == \"@\") at_count++" print " }" print " if (at_count != 1) return 0" print " # Split into local and domain parts" print " split(value, parts, \"@\")" print " local_part = parts[1]" print " domain_part = parts[2]" print " # Local and domain parts must not be empty" print " if (length(local_part) == 0 || length(domain_part) == 0) return 0" print " # Basic local part 
validation: no spaces" print " if (local_part ~ /[ ]/) return 0" print " # Domain part validation" print " if (index(domain_part, \".\") == 0) return 0" print " return 1" print "}" print "" print "function is_url(value) {" print " # Enhanced URL validation with multiple protocols" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Check for common URL schemes" print " if (value ~ /^(https|http|ftp|ftps|mailto|tel):(\\/\\/)?([a-zA-Z0-9\\.-]+)(:[0-9]+)?(\\/.*)?(\\?.*)?$/) {" print " # Extra check for http/https/ftp to ensure they have slashes" print " if ((value ~ /^http/ || value ~ /^ftp/) && value !~ /:\\/\\//) return 0" print " return 1" print " }" print " return 0" print "}" print "" print "function is_ipv4(value) {" print " # Basic IPv4 validation" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Split by dots and check each octet" print " split(value, octets, \".\")" print " if (length(octets) != 4) return 0" print " for (i = 1; i <= 4; i++) {" print " if (!is_number(octets[i])) return 0" print " if (octets[i] < 0 || octets[i] > 255) return 0" print " }" print " return 1" print "}" print "" print "function is_ipv6(value) {" print " # Enhanced IPv6 validation with interface identifiers" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Handle optional interface identifier (e.g., %eth0)" print " addr = value" print " if (index(addr, \"%\") > 0) {" print " split(addr, parts, \"%\")" print " addr = parts[1]" print " }" print " # An IPv6 address cannot contain more than one \"::\"" print " if (gsub(/::/, \"&\") > 1) return 0" print " # Check for invalid trailing colon" print " if (substr(addr, length(addr)) == \":\" && substr(addr, length(addr) - 1) != \"::\") return 0" print " has_trailing_colon = (substr(addr, length(addr) - 1) == \"::\")" print " num_parts = split(addr, parts, \":\")" print " empty_found = (addr ~ /::/)" print " total_segments = 
num_parts" print " if (has_trailing_colon) total_segments--" print " for (i = 1; i <= num_parts; i++) {" print " if (length(parts[i]) == 0) continue # Part of :: compression" print " # Each segment must be valid hex between 1 and 4 characters" print " if (parts[i] !~ /^[0-9a-fA-F]{1,4}$/) return 0" print " }" print " if (empty_found) {" print " if (total_segments > 7) return 0" print " } else {" print " if (total_segments != 8) return 0" print " }" print " return 1" print "}" print "" print "function is_uuid(value) {" print " # UUID validation (comprehensive format support)" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Pattern 1: Standard hyphenated UUID" print " if (value ~ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/) return 1" print " # Pattern 2: UUID with no hyphens (32 hex characters)" print " if (value ~ /^[0-9a-fA-F]{32}$/) return 1" print " # Pattern 3: URN-formatted UUID" print " if (value ~ /^urn:uuid:[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/) return 1" print " return 0" print "}" print "" print "function is_alpha(value) {" print " # Check if string contains only alphabetic characters" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Remove all alphabetic characters and check if empty" print " gsub(/[a-zA-Z]/, \"\", value)" print " return value == \"\"" print "}" print "" print "function is_numeric(value) {" print " # Check if string contains only numeric characters" print " if (value == \"\") return 0" print " # Convert to string and check if it contains only digits" print " str_value = value \"\"" print " # Remove all numeric characters and check if empty" print " gsub(/[0-9]/, \"\", str_value)" print " return str_value == \"\"" print "}" print "" print "function is_alphanumeric(value) {" print " # Check if string contains only alphanumeric characters" print " if (!is_string(value)) return 0" print " 
if (value == \"\") return 0" print " # Remove all alphanumeric characters and check if empty" print " gsub(/[a-zA-Z0-9]/, \"\", value)" print " return value == \"\"" print "}" print "" print "function is_palindrome(value) {" print " # Enhanced palindrome detection with better whitespace handling" print " if (!is_string(value)) return 0" print " if (value == \"\") return 1" print " # Clean string: lowercase and remove non-alphanumeric characters" print " clean_str = tolower(value)" print " gsub(/[^a-z0-9]/, \"\", clean_str)" print " len = length(clean_str)" print " if (len == 0) return 1 # Empty string after cleaning is a palindrome" print " # Check if it reads the same forwards and backwards" print " for (i = 1; i <= len / 2; i++) {" print " if (substr(clean_str, i, 1) != substr(clean_str, len - i + 1, 1)) return 0" print " }" print " return 1" print "}" print "" print "function is_in_range(value, min, max) {" print " # Check if number is within range [min, max]" print " return is_number(value) && value >= min && value <= max" print "}" print "" print "function is_even(value) {" print " # Check if number is even" print " return is_number(value) && value % 2 == 0" print "}" print "" print "function is_odd(value) {" print " # Check if number is odd" print " return is_number(value) && value % 2 != 0" print "}" print "" print "function is_prime(value) {" print " # Check if number is prime" print " if (!is_number(value) || value < 2) return 0" print " if (value == 2) return 1" print " if (value % 2 == 0) return 0" print " for (i = 3; i * i <= value; i += 2) {" print " if (value % i == 0) return 0" print " }" print " return 1" print "}" print "" print "function is_whitespace(value) {" print " # Check if string is whitespace" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " return value ~ /^[ \\t\\n\\r]+$/" print "}" print "" print "function is_uppercase(value) {" print " # Check if string is uppercase" print " if (!is_string(value)) 
return 0" print " if (value == \"\") return 0" print " return value ~ /^[A-Z]+$/" print "}" print "" print "function is_lowercase(value) {" print " # Check if string is lowercase" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " return value ~ /^[a-z]+$/" print "}" print "" print "function is_length(value, target_length) {" print " # Check if string/array has specific length" print " if (is_string(value)) {" print " return length(value) == target_length" print " } else {" print " # For arrays, count the elements" print " count = 0" print " for (i in value) count++" print " return count == target_length" print " }" print "}" print "" print "function is_array(value) {" print " # Check if value is an array (limited detection)" print " # This is a heuristic - we check if it has any elements" print " # Note: This function has limitations due to AWK's array handling" print " count = 0" print " for (i in value) {" print " count++" print " break # Just need to find one element" print " }" print " return count > 0" print "}" print "" print "function is_hex(value) {" print " # Enhanced hex validation with optional prefixes" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Strip optional prefixes" print " test_str = value" print " if (substr(test_str, 1, 2) == \"0x\" || substr(test_str, 1, 2) == \"0X\") {" print " test_str = substr(test_str, 3)" print " } else if (substr(test_str, 1, 1) == \"#\") {" print " test_str = substr(test_str, 2)" print " }" print " if (length(test_str) == 0) return 0 # Prefix only is not valid" print " return (test_str ~ /^[0-9a-fA-F]+$/) ? 
1 : 0" print "}" print "" print "function is_csv(value, _fs_orig, _nf_orig, _comma_count, _quote_count) {" print " # Check if string appears to be CSV format (robust version)" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Heuristic 1: Must contain at least one comma" print " if (index(value, \",\") == 0) return 0" print " # Heuristic 2: Should have an even number of double quotes" print " _quote_count = gsub(/\"/, \"&\", value)" print " if (_quote_count % 2 != 0) return 0" print " # Heuristic 3: When split by comma, should result in more than one field" print " _fs_orig = FS" print " _nf_orig = NF" print " FS = \",\"" print " $0 = value" print " _comma_count = NF" print " # Restore original state" print " FS = _fs_orig" print " $0 = $0" print " return (_comma_count > 1) ? 1 : 0" print "}" print "" print "function is_tsv(value, _fs_orig, _nf_orig, _tab_count) {" print " # Check if string appears to be TSV format (robust version)" print " if (!is_string(value)) return 0" print " if (value == \"\") return 0" print " # Heuristic 1: Must contain at least one tab character" print " if (index(value, \"\\t\") == 0) return 0" print " # Heuristic 2: When split by tab, should result in more than one field" print " _fs_orig = FS" print " _nf_orig = NF" print " FS = \"\\t\"" print " $0 = value" print " _tab_count = NF" print " # Restore original state" print " FS = _fs_orig" print " $0 = $0" print " return (_tab_count > 1) ? 
1 : 0" print "}" print "" print "# --- HTTP Status Code Predicates ---" print "function http_is_redirect(status) {" print " # Check if HTTP status code indicates a redirect (3xx)" print " return is_number(status) && status >= 300 && status < 400" print "}" print "" print "function http_is_client_error(status) {" print " # Check if HTTP status code indicates a client error (4xx)" print " return is_number(status) && status >= 400 && status < 500" print "}" print "" print "function http_is_server_error(status) {" print " # Check if HTTP status code indicates a server error (5xx)" print " return is_number(status) && status >= 500 && status < 600" print "}" print "" print "# --- HTTP Method Predicates ---" print "function http_is_get(method) {" print " # Check if HTTP method is GET" print " return is_string(method) && method == \"GET\"" print "}" print "" print "function http_is_post(method) {" print " # Check if HTTP method is POST" print " return is_string(method) && method == \"POST\"" print "}" print "" print "function http_is_safe_method(method) {" print " # Check if HTTP method is safe (GET, HEAD)" print " return is_string(method) && (method == \"GET\" || method == \"HEAD\")" print "}" print "" print "function http_is_mutating_method(method) {" print " # Check if HTTP method can mutate server state (POST, PUT, DELETE, PATCH)" print " return is_string(method) && (method == \"POST\" || method == \"PUT\" || method == \"DELETE\" || method == \"PATCH\")" print "}" print "" print "# --- URL/Path Predicates ---" print "function url_is_static_file(url) {" print " # Check if URL points to a static file (CSS, JS, images, etc.)" print " if (!is_string(url)) return 0" print " return index(url, \".css\") > 0 || index(url, \".js\") > 0 || index(url, \".png\") > 0 || index(url, \".jpg\") > 0 || index(url, \".jpeg\") > 0 || index(url, \".gif\") > 0 || index(url, \".svg\") > 0 || index(url, \".ico\") > 0 || index(url, \".woff\") > 0 || index(url, \".woff2\") > 0" print "}" print 
"" print "function url_has_query_params(url) {" print " # Check if URL contains query parameters" print " return is_string(url) && index(url, \"?\") > 0" print "}" print "" print "function url_is_root_path(url) {" print " # Check if URL is the root path" print " return is_string(url) && (url == \"/\" || url == \"\")" print "}" print "" print "# --- User Agent Predicates ---" print "function user_agent_is_mobile(user_agent) {" print " # Check if user agent indicates a mobile device" print " if (!is_string(user_agent)) return 0" print " return index(user_agent, \"Mobile\") > 0 || index(user_agent, \"iPhone\") > 0 || index(user_agent, \"Android\") > 0 || index(user_agent, \"iPad\") > 0" print "}" print "" print "function user_agent_is_desktop(user_agent) {" print " # Check if user agent indicates a desktop device" print " if (!is_string(user_agent)) return 0" print " # Check for desktop OS indicators, but exclude mobile Linux (Android)" print " return (index(user_agent, \"Windows\") > 0 || index(user_agent, \"Macintosh\") > 0 || (index(user_agent, \"Linux\") > 0 && index(user_agent, \"Android\") == 0))" print "}" print "" print "function is_bot(user_agent) {" print " # Check if user agent indicates a bot/crawler" print " if (!is_string(user_agent)) return 0" print " return index(user_agent, \"bot\") > 0 || index(user_agent, \"crawler\") > 0 || index(user_agent, \"spider\") > 0 || index(user_agent, \"Googlebot\") > 0 || index(user_agent, \"Bingbot\") > 0" print "}" print "" print "function user_agent_is_browser(user_agent) {" print " # Check if user agent indicates a web browser (not a bot)" print " if (!is_string(user_agent)) return 0" print " return index(user_agent, \"Mozilla\") > 0 && !is_bot(user_agent)" print "}" print "" print "# --- IP Address Predicates ---" print "function ip_is_local(ip) {" print " # Check if IP address is local/private" print " if (!is_string(ip)) return 0" print " return index(ip, \"127.0.0.1\") > 0 || index(ip, \"192.168.\") > 0 || 
index(ip, \"10.\") > 0 || index(ip, \"172.\") > 0" print "}" print "" print "function ip_is_public(ip) {" print " # Check if IP address is public (not local)" print " return !ip_is_local(ip)" print "}" print "" print "function ip_is_ipv4(ip) {" print " # Check if IP address is IPv4 format" print " return is_string(ip) && ip ~ /^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$/" print "}" print "" print "function ip_is_ipv6(ip) {" print " # Check if IP address is IPv6 format" print " return is_string(ip) && ip ~ /^[0-9a-fA-F:]+$/" print "}" print "" print "# --- Array Utility Functions ---" print "" print "function keys(array, count, i) {" print " # Returns count of keys in array" print " count = 0" print " for (i in array) count++" print " return count" print "}" print "" print "function values(array, count, i) {" print " # Returns count of values in array" print " count = 0" print " for (i in array) count++" print " return count" print "}" print "" print "function get_keys(array, result, i, count) {" print " # Populates result array with keys" print " count = 0" print " for (i in array) {" print " result[++count] = i" print " }" print " return count" print "}" print "" print "function get_values(array, result, i, count) {" print " # Populates result array with values" print " count = 0" print " for (i in array) {" print " result[++count] = array[i]" print " }" print " return count" print "}" print "" print "# --- Functional Programming Functions ---" print "" print "function map(func_name, array, result, i) {" print " # Apply function to each element of array, preserving indices" print " for (i in array) {" print " result[i] = dispatch_call(func_name, array[i])" print " }" print " return keys(array)" print "}" print "" print "function reduce(func_name, array, initial, result, i, first) {" print " # Reduce array using function (left fold)" print " result = initial" print " first = 1" print " for (i in array) {" print " if (first) {" print " result = array[i]" print " first = 0" 
print " } else {" print " result = dispatch_call(func_name, result, array[i])" print " }" print " }" print " return result" print "}" print "" print "function pipe(value, func_name, result) {" print " # Pipe value through a single function (simplified version)" print " result = dispatch_call(func_name, value)" print " return result" print "}" print "" print "function pipe_multi(value, func_names, result, i, func_count) {" print " # Pipe value through multiple functions (func_names is array)" print " result = value" print " func_count = length(func_names)" print " for (i = 1; i <= func_count; i++) {" print " result = dispatch_call(func_names[i], result)" print " }" print " return result" print "}" print "" print "# --- Enhanced Array Utilities ---" print "" print "function filter(predicate_func, array, result, i, count) {" print " # Filter array elements based on predicate function" print " count = 0" print " for (i in array) {" print " if (dispatch_call(predicate_func, array[i])) {" print " result[++count] = array[i]" print " }" print " }" print " return count" print "}" print "" print "function find(predicate_func, array, i, keys, key_count) {" print " # Find first element that matches predicate" print " key_count = get_keys(array, keys)" print " for (i = 1; i <= key_count; i++) {" print " if (dispatch_call(predicate_func, array[keys[i]])) {" print " return array[keys[i]]" print " }" print " }" print " return \"\" # Not found" print "}" print "" print "function findIndex(predicate_func, array, i, keys, key_count) {" print " # Find index of first element that matches predicate" print " key_count = get_keys(array, keys)" print " for (i = 1; i <= key_count; i++) {" print " if (dispatch_call(predicate_func, array[keys[i]])) {" print " return i" print " }" print " }" print " return 0 # Not found" print "}" print "" print "function flatMap(func_name, array, result, i, temp_array, temp_count, j) {" print " # Apply function to each element and flatten the result" print " 
for (i in array) {"
    # NOTE: the print statements from here to the closing brace below are the
    # unchanged tail of generate_standard_library(), whose beginning lies
    # above this span. They emit the rest of flatMap() plus take() and drop().
    print " temp_count = dispatch_call(func_name, array[i], temp_array)"
    print " for (j = 1; j <= temp_count; j++) {"
    print " result[keys(result) + 1] = temp_array[j]"
    print " }"
    print " }"
    print " return keys(result)"
    print "}"
    print ""
    print "function take(count, array, result, i, count_taken) {"
    print " # Take first n elements from array"
    print " count_taken = 0"
    print " for (i in array) {"
    print " if (count_taken >= count) break"
    print " count_taken++"
    print " result[count_taken] = array[i]"
    print " }"
    print " return count_taken"
    print "}"
    print ""
    print "function drop(count, array, result, i, count_dropped, count_kept) {"
    print " # Drop first n elements from array"
    print " count_dropped = 0"
    print " count_kept = 0"
    print " for (i in array) {"
    print " count_dropped++"
    print " if (count_dropped > count) {"
    print " count_kept++"
    print " result[count_kept] = array[i]"
    print " }"
    print " }"
    print " return count_kept"
    print "}"
    print ""
}

# Rewrite calls to a single rawk function inside a piece of source text.
#
#   text        - source text to scan
#   name        - rawk function name; matched literally, not as a regex
#   replacement - generated awk function name to substitute
#
# Returns text with every `name(` replaced by `replacement(`, but only where
# `name` is a whole identifier: an occurrence directly preceded by a letter,
# digit or underscore is the tail of a longer identifier (e.g. `madd(` when
# name is `add`) and is left untouched. POSIX awk regexes have no
# word-boundary operator, so the scan is done by hand with index()/substr().
#
# The scan is purely textual: occurrences inside string literals or comments
# are rewritten as well.
function __rawk_rewrite_calls(text, name, replacement,    out, pos, before, name_len) {
    name_len = length(name)
    if (name_len == 0) return text

    out = ""
    while ((pos = index(text, name "(")) > 0) {
        # Character in front of the match; when the match sits at the start of
        # the remaining text, it is the last character already moved to out.
        before = (pos > 1) ? substr(text, pos - 1, 1) : substr(out, length(out), 1)

        if (before ~ /^[A-Za-z0-9_]$/) {
            # Suffix of a longer identifier: copy it through unchanged.
            out = out substr(text, 1, pos + name_len - 1)
            text = substr(text, pos + name_len)
        } else {
            out = out substr(text, 1, pos - 1) replacement "("
            text = substr(text, pos + name_len + 1)
        }
    }
    return out text
}

# Rewrite calls to every collected rawk function in text.
# The function at index k in FUNCTION_NAMES maps to "__lambda_" (k - 1).
function __rawk_rewrite_all_calls(text,    k) {
    for (k = 1; k <= function_count; k++) {
        text = __rawk_rewrite_calls(text, FUNCTION_NAMES[k], "__lambda_" (k - 1))
    }
    return text
}

# Generate function definitions
#
# Prints nothing when function_count is 0. Otherwise prints:
#   1. a BEGIN block filling RAWK_DISPATCH[name] with
#      "internal_name|arg_count|source_line" for each collected function;
#   2. one awk function per collected rawk function, named __lambda_<i-1>,
#      with calls to other rawk functions rewritten to their internal names.
#
# Reads FUNCTION_NAMES, FUNCTION_ARGS, FUNCTION_BODIES, FUNCTION_TYPES,
# FUNCTION_LINES and function_count. The names after the extra spaces in the
# parameter list are locals, so no loop state leaks into globals.
function generate_function_definitions(    i, internal_name, arg_count, args_array, body) {
    if (function_count == 0) return

    print "# --- User Functions ---"

    # Build dispatch table
    print "# Dispatch table"
    print "BEGIN {"
    for (i = 1; i <= function_count; i++) {
        internal_name = "__lambda_" (i - 1)
        # Argument count is the number of comma-separated entries (0 if empty).
        arg_count = split(FUNCTION_ARGS[i], args_array, ",")
        print " RAWK_DISPATCH[\"" FUNCTION_NAMES[i] "\"] = \"" internal_name "|" arg_count "|" FUNCTION_LINES[i] "\""
    }
    print "}"
    print ""

    # Generate function definitions
    for (i = 1; i <= function_count; i++) {
        internal_name = "__lambda_" (i - 1)

        # Replace recursive calls and calls to other rawk functions.
        body = __rawk_rewrite_all_calls(FUNCTION_BODIES[i])

        print "function " internal_name "(" FUNCTION_ARGS[i] ") {"
        if (FUNCTION_TYPES[i] == "single") {
            # Single-line functions are a bare expression: return its value.
            print " return " body
        } else {
            print body
        }
        print "}"
        print ""
    }
}

# Generate main script body
#
# Prints the collected main_script_lines with rawk function calls rewritten
# to their internal names. If any line opens a BEGIN block the lines are
# printed as they are; otherwise they are wrapped in a generated BEGIN block
# and each line is prefixed with a single space.
function generate_main_script(    i, has_begin, prefix, line) {
    print "# --- Main Script Body ---"

    # Check if there's already a BEGIN block
    has_begin = 0
    for (i = 1; i <= main_script_count; i++) {
        if (main_script_lines[i] ~ /^[ \t]*BEGIN[ \t]*\{/) {
            has_begin = 1
            break
        }
    }

    prefix = has_begin ? "" : " "

    if (!has_begin) print "BEGIN {"
    for (i = 1; i <= main_script_count; i++) {
        line = __rawk_rewrite_all_calls(main_script_lines[i])
        print prefix line
    }
    if (!has_begin) print "}"
}

# Write one diagnostic record to stderr.
#
#   headline   - first line of the record (icon, optional label, message)
#   line_num   - source line number the diagnostic refers to
#   line       - source text shown as context
#   suggestion - optional hint; its line is omitted when empty
#
# A blank line is written after every record.
function __rawk_emit_diagnostic(headline, line_num, line, suggestion) {
    print headline > "/dev/stderr"
    print " at line " line_num " in " FILENAME > "/dev/stderr"
    print " context: " line > "/dev/stderr"
    if (suggestion != "") {
        print " 💡 " suggestion > "/dev/stderr"
    }
    print "" > "/dev/stderr"
}

# Report a syntax validation error on stderr and count it in
# validation_errors.
function report_validation_error(message, line_num, line, suggestion) {
    __rawk_emit_diagnostic("❌ " message, line_num, line, suggestion)
    validation_errors++
}

# Report a syntax validation warning on stderr and count it in
# validation_warnings.
function report_validation_warning(message, line_num, line, suggestion) {
    __rawk_emit_diagnostic("⚠️ " message, line_num, line, suggestion)
    validation_warnings++
}

# TODO: think through ways to add more passes to enhance compiler error messages
# Report a compilation error on stderr; increments both error_count and
# errors.
function report_error(message, line_num, line, suggestion) {
    __rawk_emit_diagnostic("❌ rawk compilation error: " message, line_num, line, suggestion)
    error_count++
    errors++
}

# Report a compilation warning on stderr; increments both warning_count and
# warnings.
function report_warning(message, line_num, line, suggestion) {
    __rawk_emit_diagnostic("⚠️ rawk compilation warning: " message, line_num, line, suggestion)
    warning_count++
    warnings++
}

# END block to generate final output
END {
    # Check if any validation errors occurred
    if (validation_errors > 0) {
        print "" > "/dev/stderr"
        print "📊 Validation Summary" > "/dev/stderr"
        print "====================" > "/dev/stderr"
        print "Total Lines: " line_count > "/dev/stderr"
        print "Errors: " validation_errors > "/dev/stderr"
        print "Warnings: " validation_warnings > "/dev/stderr"
        print "❌ Syntax validation failed! Exiting without code generation." > "/dev/stderr"
        exit 1
    }

    # Generate standard library
    generate_standard_library()

    # Generate function definitions
    generate_function_definitions()

    # Generate main script body
    generate_main_script()

    # Add compilation metadata
    print "# Rawk compilation summary:"
    print "# - Rawk Version: " RAWK_VERSION
    print "# - Functions defined: " functions_defined
    print "# - Source lines: " line_count
    print "# - Errors: " errors
    print "# - Warnings: " warnings
    print ""

    # The generated code is still written out, but compilation errors reported
    # through report_error() must be visible in the exit status
    # (0 = success, 1 = error).
    if (errors > 0) exit 1
}