about summary refs log tree commit diff stats
path: root/awk/rawk/example.rawk
diff options
context:
space:
mode:
Diffstat (limited to 'awk/rawk/example.rawk')
-rw-r--r--awk/rawk/example.rawk182
1 files changed, 182 insertions, 0 deletions
diff --git a/awk/rawk/example.rawk b/awk/rawk/example.rawk
new file mode 100644
index 0000000..950f5e9
--- /dev/null
+++ b/awk/rawk/example.rawk
@@ -0,0 +1,182 @@
+ # Main processing pipeline
+ BEGIN {
+     print "Apache Log Analysis Report"
+     print "============================="
+     print ""
+ }
+
+ RAWK {
+     # Helper functions for parsing and analysis
+     $extract_method = (request) -> {
+         split(request, parts, " ")
+         return parts[1]
+     };
+     
+     $extract_url = (request) -> {
+         split(request, parts, " ")
+         return parts[2]
+     };
+     
+     $format_error_report = (ip, status, url, user_agent) -> {
+         return ip " - " status " - " url " (" user_agent ")"
+     };
+     
+     $format_success_report = (ip, method, url, bytes) -> {
+         return ip " - " method " " url " (" bytes " bytes)"
+     };
+     
+     $is_success = (status) -> {
+         return status >= 200 && status < 300
+     };
+     
+     $is_api_request = (url) -> {
+         return index(url, "/api/") > 0
+     };
+     
+     $is_large_request = (bytes) -> {
+         return bytes > 1048576  # 1MB
+     };
+     
+     # Functional programming examples
+     $extract_endpoint = (url) -> {
+         return url
+     };
+     
+     $extract_bot_components = (user_agent, result) -> {
+         split(user_agent, result, " ")
+         return length(result)
+     };
+ }
+
+ # Process each log line
+ {
+     # Parse Apache log format: IP - - [timestamp] "method url status" bytes "referer" "user-agent"
+     # Note that we use a series of simpler regex matches, rather than trying to do it all at once
+     if (match($0, /^([0-9.]+)/)) {
+         ip = substr($0, RSTART, RLENGTH)
+     
+         # Extract request (method url protocol)
+         if (match($0, /"([^"]+)"/)) {
+             request = substr($0, RSTART + 1, RLENGTH - 2)
+             # Extract method and URL from request
+             method = extract_method(request)
+             url = extract_url(request)
+         }
+     
+         # Extract status code (number after the request)
+         if (match($0, /" ([0-9]+) /)) {
+             status = substr($0, RSTART + 1, RLENGTH - 2)
+             # Remove leading/trailing spaces
+             gsub(/^[ \t]+|[ \t]+$/, "", status)
+         }
+     
+         # Extract bytes (number after request)
+         if (match($0, /" ([0-9]+) /)) {
+             bytes = substr($0, RSTART + 1, RLENGTH - 2)
+         }
+     
+         # Extract user agent (last quoted field)
+         if (match($0, /"([^"]*)"$/)) {
+             user_agent = substr($0, RSTART + 1, RLENGTH - 2)
+         }
+     
+         # Store for analysis
+         request_count++
+     
+         # Real-time processing using some standard library predicates
+         if (http_is_server_error(status)) {
+             server_error_count++
+             error_report = format_error_report(ip, status, url, user_agent)
+             print "SERVER ERROR: " error_report
+         } else if (http_is_client_error(status)) {
+             client_error_count++
+             error_report = format_error_report(ip, status, url, user_agent)
+             print "CLIENT ERROR: " error_report
+         } else if (is_success(status)) {
+             success_count++
+             success_report = format_success_report(ip, method, url, bytes)
+             print "✓ " success_report
+         }
+     
+         # Track different types of requests
+         if (is_api_request(url)) {
+             api_count++
+             api_urls[api_count] = url
+         }
+     
+         if (url_is_static_file(url)) {
+             static_count++
+             static_urls[static_count] = url
+         }
+     
+         if (http_is_mutating_method(method)) {
+             mutation_count++
+             if (ip_is_public(ip)) {
+                 print "EXTERNAL MUTATION: " ip " " method " " url
+             }
+         }
+     
+         # Track user types
+         if (is_bot(user_agent)) {
+             bot_count++
+             bot_agents[bot_count] = user_agent
+         } else if (user_agent_is_mobile(user_agent)) {
+             mobile_count++
+         } else if (user_agent_is_desktop(user_agent)) {
+             desktop_count++
+         }
+     
+         # Track large requests
+         if (is_large_request(bytes)) {
+             large_count++
+             large_urls[large_count] = url
+         }
+     }
+ }
+
+ END {
+     print ""
+     print "Summary Statistics"
+     print "===================="
+     print "Total Requests:", request_count
+     print "Successful:", success_count
+     print "Client Errors:", client_error_count
+     print "Server Errors:", server_error_count
+     print "Total Errors:", client_error_count + server_error_count
+     print "Error Rate:", sprintf("%.2f%%", ((client_error_count + server_error_count) / request_count) * 100)
+     print "API Requests:", api_count
+     print "Static Files:", static_count
+     print "Mutating Requests:", mutation_count
+     print "Mobile Users:", mobile_count
+     print "Desktop Users:", desktop_count
+     print "Bot Requests:", bot_count
+     print "Large Requests (>1MB):", large_count
+     
+     # Some functional patterns at play, map, flatMap, and take.
+     if (api_count > 0) {
+         print ""
+         print "API Usage Analysis"
+         print "===================="
+         
+         # Use map to extract API endpoints
+         endpoint_count = map("extract_endpoint", api_urls, endpoints)
+         print "API Endpoints found:", endpoint_count
+     }
+     
+     if (bot_count > 0) {
+         print ""
+         print "Bot Activity Analysis"
+         print "========================"
+         
+         # Use flatMap to extract bot user agent components
+         bot_components_count = flatMap("extract_bot_components", bot_agents, bot_components)
+         print "Bot components analyzed:", bot_components_count
+         
+         # Use take to show top 3 bot components
+         top_components_count = take(3, bot_components, top_components)
+         print "Top bot components:", top_components_count
+     }
+     
+     print ""
+     print "End analysis"
+ } 
\ No newline at end of file