diff options
Diffstat (limited to 'awk/rawk/example.rawk')
-rw-r--r-- | awk/rawk/example.rawk | 359 |
1 files changed, 179 insertions, 180 deletions
diff --git a/awk/rawk/example.rawk b/awk/rawk/example.rawk
index bda56b7..950f5e9 100644
--- a/awk/rawk/example.rawk
+++ b/awk/rawk/example.rawk
@@ -1,183 +1,182 @@
-# This demonstrates most rawk features in a setting familiar to awk
-# Usage: awk -f rawk.awk example.rawk | awk -f - sample.log
+    # Main processing pipeline: print the report banner once, before any input is read
+    BEGIN {
+        print "Apache Log Analysis Report"
+        print "============================="
+        print ""
+    }
 
-# User defined predicate functions for log analysis
-# This shows off rawk's single line function syntax
-$is_error = (status) -> status >= 400;
-$is_success = (status) -> status >= 200 && status < 300;
-$is_large_request = (bytes) -> bytes > 1000000; # > 1MB
-$is_api_request = (url) -> index(url, "/api/") > 0;
-$is_bot = (user_agent) -> index(user_agent, "bot") > 0 || index(user_agent, "crawler") > 0;
+    RAWK {
+        # Helper functions for parsing and analysis ("request" is the text between the first pair of quotes)
+        $extract_method = (request) -> {
+            split(request, parts, " ")
+            return parts[1]
+        };
+
+        $extract_url = (request) -> {
+            split(request, parts, " ")
+            return parts[2]
+        };
+
+        $format_error_report = (ip, status, url, user_agent) -> {
+            return ip " - " status " - " url " (" user_agent ")"
+        };
+
+        $format_success_report = (ip, method, url, bytes) -> {
+            return ip " - " method " " url " (" bytes " bytes)"
+        };
+
+        $is_success = (status) -> {
+            return status >= 200 && status < 300
+        };
+
+        $is_api_request = (url) -> {
+            return index(url, "/api/") > 0
+        };
+
+        $is_large_request = (bytes) -> {
+            return bytes > 1048576 # 1 MiB (1024 * 1024 bytes)
+        };
+
+        # Functional programming examples (passed by name to map and flatMap in END)
+        $extract_endpoint = (url) -> {
+            return url
+        };
+
+        $extract_bot_components = (user_agent, result) -> {
+            # split() already returns the element count; length(array) is not supported by every awk
+            return split(user_agent, result, " ")
+        };
+    }
 
-# Data parsing and transformation functions
-# These show off rawk's multi-line arrow functions
-$extract_status = (request_line) -> {
-    split(request_line, parts, " ")
-    # The status code is the second part, not the third
-    return parts[2]
-};
+    # Process each log line
+    {
+        # Parse Apache combined log format: IP - - [timestamp] "method url protocol" status bytes "referer" "user-agent"
+        # Note that we use a series of simpler regex matches, rather than trying to do it all at once
+        if (match($0, /^([0-9.]+)/)) {
+            ip = substr($0, RSTART, RLENGTH)
+            status = bytes = 0; request = method = url = user_agent = ""  # reset so a failed match cannot reuse the previous record's value
+            # Extract request (method url protocol)
+            if (match($0, /"([^"]+)"/)) {
+                request = substr($0, RSTART + 1, RLENGTH - 2)
+                # Extract method and URL from request
+                method = extract_method(request)
+                url = extract_url(request)
+            }
+
+            # Extract status code: skip the quote and space before it and the space after it; "+ 0" forces a number
+            if (match($0, /" [0-9]+ /)) {
+                status = substr($0, RSTART + 2, RLENGTH - 3) + 0
+            }
+
+            # Extract bytes: the number that follows the status code (matching `" N ` alone would return the status again)
+            if (match($0, /" [0-9]+ [0-9]+/)) {
+                bytes = substr($0, RSTART, RLENGTH)
+                sub(/^" [0-9]+ /, "", bytes)
+                bytes += 0
+            }
+
+            # Extract user agent (last quoted field)
+            if (match($0, /"([^"]*)"$/)) {
+                user_agent = substr($0, RSTART + 1, RLENGTH - 2)
+            }
+
+            # Store for analysis
+            request_count++
+
+            # Real-time processing using some standard library predicates
+            if (http_is_server_error(status)) {
+                server_error_count++
+                error_report = format_error_report(ip, status, url, user_agent)
+                print "SERVER ERROR: " error_report
+            } else if (http_is_client_error(status)) {
+                client_error_count++
+                error_report = format_error_report(ip, status, url, user_agent)
+                print "CLIENT ERROR: " error_report
+            } else if (is_success(status)) {
+                success_count++
+                success_report = format_success_report(ip, method, url, bytes)
+                print "✓ " success_report
+            }
+
+            # Track different types of requests
+            if (is_api_request(url)) {
+                api_count++
+                api_urls[api_count] = url
+            }
+
+            if (url_is_static_file(url)) {
+                static_count++
+                static_urls[static_count] = url
+            }
+
+            if (http_is_mutating_method(method)) {
+                mutation_count++
+                if (ip_is_public(ip)) {
+                    print "EXTERNAL MUTATION: " ip " " method " " url
+                }
+            }
+
+            # Track user types (bot test is inlined: no is_bot function is defined in the RAWK block above)
+            if (index(user_agent, "bot") > 0 || index(user_agent, "crawler") > 0) {
+                bot_count++
+                bot_agents[bot_count] = user_agent
+            } else if (user_agent_is_mobile(user_agent)) {
+                mobile_count++
+            } else if (user_agent_is_desktop(user_agent)) {
+                desktop_count++
+            }
+
+            # Track large requests
+            if (is_large_request(bytes)) {
+                large_count++
+                large_urls[large_count] = url
+            }
+        }
+    }
 
-$extract_method = (request_line) -> {
-    split(request_line, parts, " ")
-    return parts[1]
-};
-
-$extract_url = (request_line) -> {
-    split(request_line, parts, " ")
-    return parts[2]
-};
-
-# Aggregation and reporting functions
-$format_error_report = (ip, status, url, user_agent) -> {
-    return "ERROR: " status " - " ip " accessed " url " (" user_agent ")"
-};
-
-$format_success_report = (ip, method, url, bytes) -> {
-    size_label = is_large_request(bytes) ? "LARGE" : "normal"
-    return "SUCCESS: " method " " url " (" bytes " bytes, " size_label ")"
-};
-
-# Main processing pipeline
-BEGIN {
-    print "Apache Log Analysis Report"
-    print "============================="
-    print ""
-}
-
-# Process each log line
-{
-    # Parse Apache log format: IP - - [timestamp] "method url status" bytes "referer" "user-agent"
-    # Note that we use a series of simpler regex matches, rather than trying to do it all at once
-    if (match($0, /^([0-9.]+)/)) {
-        ip = substr($0, RSTART, RLENGTH)
-
-        # Extract request (method url protocol)
-        if (match($0, /"([^"]+)"/)) {
-            request = substr($0, RSTART + 1, RLENGTH - 2)
-            # Extract method and URL from request
-            method = extract_method(request)
-            url = extract_url(request)
-        }
-
-        # Extract status code (number after the request)
-        if (match($0, /" ([0-9]+) /)) {
-            status = substr($0, RSTART + 1, RLENGTH - 2)
-            # Remove leading/trailing spaces
-            gsub(/^[ \t]+|[ \t]+$/, "", status)
-        }
-
-        # Extract bytes (number after request)
-        if (match($0, /" ([0-9]+) /)) {
-            bytes = substr($0, RSTART + 1, RLENGTH - 2)
-        }
-
-        # Extract user agent (last quoted field)
-        if (match($0, /"([^"]*)"$/)) {
-            user_agent = substr($0, RSTART + 1, RLENGTH - 2)
-        }
-
-        # Store for analysis
-        request_count++
-
-        # Real-time processing using some standard library predicates
-        if (http_is_server_error(status)) {
-            server_error_count++
-            error_report = format_error_report(ip, status, url, user_agent)
-            print "SERVER ERROR: " error_report
-        } else if (http_is_client_error(status)) {
-            client_error_count++
-            error_report = format_error_report(ip, status, url, user_agent)
-            print "CLIENT ERROR: " error_report
-        } else if (is_success(status)) {
-            success_count++
-            success_report = format_success_report(ip, method, url, bytes)
-            print "✓ " success_report
-        }
-
-        # Track different types of requests
-        if (is_api_request(url)) {
-            api_count++
-            api_urls[api_count] = url
-        }
-
-        if (url_is_static_file(url)) {
-            static_count++
-            static_urls[static_count] = url
-        }
-
-        if (http_is_mutating_method(method)) {
-            mutation_count++
-            if (ip_is_public(ip)) {
-                print "EXTERNAL MUTATION: " ip " " method " " url
-            }
-        }
-
-        # Track user types
-        if (is_bot(user_agent)) {
-            bot_count++
-            bot_agents[bot_count] = user_agent
-        } else if (user_agent_is_mobile(user_agent)) {
-            mobile_count++
-        } else if (user_agent_is_desktop(user_agent)) {
-            desktop_count++
-        }
-
-        # Track large requests
-        if (is_large_request(bytes)) {
-            large_count++
-            large_urls[large_count] = url
-        }
-    }
-}
-
-END {
-    print ""
-    print "Summary Statistics"
-    print "===================="
-    print "Total Requests:", request_count
-    print "Successful:", success_count
-    print "Client Errors:", client_error_count
-    print "Server Errors:", server_error_count
-    print "Total Errors:", client_error_count + server_error_count
-    print "Error Rate:", sprintf("%.2f%%", ((client_error_count + server_error_count) / request_count) * 100)
-    print "API Requests:", api_count
-    print "Static Files:", static_count
-    print "Mutating Requests:", mutation_count
-    print "Mobile Users:", mobile_count
-    print "Desktop Users:", desktop_count
-    print "Bot Requests:", bot_count
-    print "Large Requests (>1MB):", large_count
-
-    # Some functional patterns at play, map, flatMap, and take.
-    if (api_count > 0) {
-        print ""
-        print "API Usage Analysis"
-        print "===================="
-
-        # Use map to extract API endpoints
-        $extract_endpoint = (url) -> url;
-        endpoint_count = map("extract_endpoint", api_urls, endpoints)
-        print "API Endpoints found:", endpoint_count
-    }
-
-    if (bot_count > 0) {
-        print ""
-        print "Bot Activity Analysis"
-        print "========================"
-
-        # Use flatMap to extract bot user agent components
-        $extract_bot_components = (user_agent, result) -> {
-            split(user_agent, result, " ")
-            return length(result)
-        };
-        bot_components_count = flatMap("extract_bot_components", bot_agents, bot_components)
-        print "Bot components analyzed:", bot_components_count
-
-        # Use take to show top 3 bot components
-        top_components_count = take(3, bot_components, top_components)
-        print "Top bot components:", top_components_count
-    }
-
-    print ""
-    print "End analysis"
-}
\ No newline at end of file
+    END {
+        print ""
+        print "Summary Statistics"
+        print "===================="
+        print "Total Requests:", request_count + 0
+        print "Successful:", success_count + 0
+        print "Client Errors:", client_error_count + 0
+        print "Server Errors:", server_error_count + 0
+        print "Total Errors:", client_error_count + server_error_count
+        print "Error Rate:", sprintf("%.2f%%", (request_count > 0 ? ((client_error_count + server_error_count) / request_count) * 100 : 0))
+        print "API Requests:", api_count + 0
+        print "Static Files:", static_count + 0
+        print "Mutating Requests:", mutation_count + 0
+        print "Mobile Users:", mobile_count + 0
+        print "Desktop Users:", desktop_count + 0
+        print "Bot Requests:", bot_count + 0
+        print "Large Requests (>1MB):", large_count + 0
+
+        # Some functional patterns at play, map, flatMap, and take.
+        if (api_count > 0) {
+            print ""
+            print "API Usage Analysis"
+            print "===================="
+
+            # Use map to extract API endpoints
+            endpoint_count = map("extract_endpoint", api_urls, endpoints)
+            print "API Endpoints found:", endpoint_count
+        }
+
+        if (bot_count > 0) {
+            print ""
+            print "Bot Activity Analysis"
+            print "========================"
+
+            # Use flatMap to extract bot user agent components
+            bot_components_count = flatMap("extract_bot_components", bot_agents, bot_components)
+            print "Bot components analyzed:", bot_components_count
+
+            # Use take to show top 3 bot components
+            top_components_count = take(3, bot_components, top_components)
+            print "Top bot components:", top_components_count
+        }
+
+        print ""
+        print "End analysis"
+    }
\ No newline at end of file