about summary refs log tree commit diff stats
path: root/awk/rawk/example.rawk
diff options
context:
space:
mode:
Diffstat (limited to 'awk/rawk/example.rawk')
-rw-r--r-- awk/rawk/example.rawk 359
1 files changed, 179 insertions, 180 deletions
diff --git a/awk/rawk/example.rawk b/awk/rawk/example.rawk
index bda56b7..950f5e9 100644
--- a/awk/rawk/example.rawk
+++ b/awk/rawk/example.rawk
@@ -1,183 +1,182 @@
-# This demonstrates most rawk features in a setting familiar to awk
-# Usage: awk -f rawk.awk example.rawk | awk -f - sample.log
+ # Main processing pipeline: report header, emitted before any input is read
+ BEGIN {
+     # Title, underline, then one blank separator line
+     printf "%s\n%s\n\n", "Apache Log Analysis Report", "============================="
+ }
 
-# User defined predicate functions for log analysis
-# This shows off rawk's single line function syntax
-$is_error = (status) -> status >= 400;
-$is_success = (status) -> status >= 200 && status < 300;
-$is_large_request = (bytes) -> bytes > 1000000;  # > 1MB
-$is_api_request = (url) -> index(url, "/api/") > 0;
-$is_bot = (user_agent) -> index(user_agent, "bot") > 0 || index(user_agent, "crawler") > 0;
+ RAWK {
+     # Helper functions for parsing and analysis.
+     # The pattern-action blocks in this file call them by bare name
+     # (e.g. extract_method(request)).
+     
+     # extract_method: first space-separated token of the request line.
+     # NOTE(review): parts is not a parameter, so it is presumably a global
+     # array shared with extract_url - confirm against the generated awk.
+     $extract_method = (request) -> {
+         split(request, parts, " ")
+         return parts[1]
+     };
+     
+     # extract_url: second space-separated token of the request line.
+     $extract_url = (request) -> {
+         split(request, parts, " ")
+         return parts[2]
+     };
+     
+     # format_error_report: "ip - status - url (user_agent)".
+     $format_error_report = (ip, status, url, user_agent) -> {
+         return ip " - " status " - " url " (" user_agent ")"
+     };
+     
+     # format_success_report: "ip - method url (bytes bytes)".
+     $format_success_report = (ip, method, url, bytes) -> {
+         return ip " - " method " " url " (" bytes " bytes)"
+     };
+     
+     # is_success: true for status codes in the range 200-299.
+     $is_success = (status) -> {
+         return status >= 200 && status < 300
+     };
+     
+     # is_api_request: true when the URL contains "/api/" anywhere.
+     $is_api_request = (url) -> {
+         return index(url, "/api/") > 0
+     };
+     
+     # is_large_request: true when bytes exceeds 1048576.
+     $is_large_request = (bytes) -> {
+         return bytes > 1048576  # 1MB
+     };
+     
+     # is_bot: true when the user agent contains "bot" or "crawler"
+     # (case-sensitive substring test). The per-line block calls is_bot(),
+     # so the definition has to live here alongside the other predicates.
+     $is_bot = (user_agent) -> {
+         return index(user_agent, "bot") > 0 || index(user_agent, "crawler") > 0
+     };
+     
+     # Functional programming examples
+     # extract_endpoint: identity mapping, passed by name to map() in END.
+     $extract_endpoint = (url) -> {
+         return url
+     };
+     
+     # extract_bot_components: splits user_agent on spaces into the
+     # caller-supplied result array and returns the number of pieces;
+     # passed by name to flatMap() in END.
+     $extract_bot_components = (user_agent, result) -> {
+         split(user_agent, result, " ")
+         return length(result)
+     };
+ }
 
-# Data parsing and transformation functions
-# These show off rawk's multi-line arrow functions
-$extract_status = (request_line) -> {
-    split(request_line, parts, " ")
-    # The status code is the second part, not the third
-    return parts[2]
-};
+ # Process each log line
+ {
+     # Parse Apache log format: IP - - [timestamp] "method url protocol" status bytes "referer" "user-agent"
+     # Note that we use a series of simpler regex matches, rather than trying to do it all at once
+     if (match($0, /^([0-9.]+)/)) {
+         ip = substr($0, RSTART, RLENGTH)
+     
+         # Clear the per-line fields so a line that lacks one of them does not
+         # silently reuse the value parsed from an earlier line
+         request = method = url = status = bytes = user_agent = ""
+     
+         # Extract request (method url protocol)
+         if (match($0, /"([^"]+)"/)) {
+             request = substr($0, RSTART + 1, RLENGTH - 2)
+             # Extract method and URL from request
+             method = extract_method(request)
+             url = extract_url(request)
+         }
+     
+         # Extract status code (first number after the quoted request)
+         if (match($0, /" ([0-9]+) /)) {
+             # Skip the quote and the space, drop the trailing space.
+             # "+ 0" makes the value numeric so later comparisons are
+             # numeric rather than string comparisons.
+             status = substr($0, RSTART + 2, RLENGTH - 3) + 0
+         }
+     
+         # Extract bytes (second number after the request; "-" means no body was sent)
+         if (match($0, /" [0-9]+ ([0-9]+|-)/)) {
+             # The matched text is: quote, status, bytes - keep the last piece
+             field_count = split(substr($0, RSTART, RLENGTH), status_fields, " ")
+             bytes = status_fields[field_count] + 0
+         }
+     
+         # Extract user agent (last quoted field)
+         if (match($0, /"([^"]*)"$/)) {
+             user_agent = substr($0, RSTART + 1, RLENGTH - 2)
+         }
+     
+         # Store for analysis
+         request_count++
+     
+         # Real-time processing using some standard library predicates
+         if (http_is_server_error(status)) {
+             server_error_count++
+             error_report = format_error_report(ip, status, url, user_agent)
+             print "SERVER ERROR: " error_report
+         } else if (http_is_client_error(status)) {
+             client_error_count++
+             error_report = format_error_report(ip, status, url, user_agent)
+             print "CLIENT ERROR: " error_report
+         } else if (is_success(status)) {
+             success_count++
+             success_report = format_success_report(ip, method, url, bytes)
+             print "✓ " success_report
+         }
+     
+         # Track different types of requests
+         if (is_api_request(url)) {
+             api_count++
+             api_urls[api_count] = url
+         }
+     
+         if (url_is_static_file(url)) {
+             static_count++
+             static_urls[static_count] = url
+         }
+     
+         if (http_is_mutating_method(method)) {
+             mutation_count++
+             if (ip_is_public(ip)) {
+                 print "EXTERNAL MUTATION: " ip " " method " " url
+             }
+         }
+     
+         # Track user types
+         if (is_bot(user_agent)) {
+             bot_count++
+             bot_agents[bot_count] = user_agent
+         } else if (user_agent_is_mobile(user_agent)) {
+             mobile_count++
+         } else if (user_agent_is_desktop(user_agent)) {
+             desktop_count++
+         }
+     
+         # Track large requests
+         if (is_large_request(bytes)) {
+             large_count++
+             large_urls[large_count] = url
+         }
+     }
+ }
 
-$extract_method = (request_line) -> {
-    split(request_line, parts, " ")
-    return parts[1]
-};
-
-$extract_url = (request_line) -> {
-    split(request_line, parts, " ")
-    return parts[2]
-};
-
-# Aggregation and reporting functions
-$format_error_report = (ip, status, url, user_agent) -> {
-    return "ERROR: " status " - " ip " accessed " url " (" user_agent ")"
-};
-
-$format_success_report = (ip, method, url, bytes) -> {
-    size_label = is_large_request(bytes) ? "LARGE" : "normal"
-    return "SUCCESS: " method " " url " (" bytes " bytes, " size_label ")"
-};
-
-# Main processing pipeline
-BEGIN {
-    print "Apache Log Analysis Report"
-    print "============================="
-    print ""
-}
-
-# Process each log line
-{
-    # Parse Apache log format: IP - - [timestamp] "method url status" bytes "referer" "user-agent"
-    # Note that we use a series of simpler regex matches, rather than trying to do it all at once
-    if (match($0, /^([0-9.]+)/)) {
-        ip = substr($0, RSTART, RLENGTH)
-        
-        # Extract request (method url protocol)
-        if (match($0, /"([^"]+)"/)) {
-            request = substr($0, RSTART + 1, RLENGTH - 2)
-            # Extract method and URL from request
-            method = extract_method(request)
-            url = extract_url(request)
-        }
-        
-        # Extract status code (number after the request)
-        if (match($0, /" ([0-9]+) /)) {
-            status = substr($0, RSTART + 1, RLENGTH - 2)
-            # Remove leading/trailing spaces
-            gsub(/^[ \t]+|[ \t]+$/, "", status)
-        }
-        
-        # Extract bytes (number after request)
-        if (match($0, /" ([0-9]+) /)) {
-            bytes = substr($0, RSTART + 1, RLENGTH - 2)
-        }
-        
-        # Extract user agent (last quoted field)
-        if (match($0, /"([^"]*)"$/)) {
-            user_agent = substr($0, RSTART + 1, RLENGTH - 2)
-        }
-        
-        # Store for analysis
-        request_count++
-        
-        # Real-time processing using some standard library predicates
-        if (http_is_server_error(status)) {
-            server_error_count++
-            error_report = format_error_report(ip, status, url, user_agent)
-            print "SERVER ERROR: " error_report
-        } else if (http_is_client_error(status)) {
-            client_error_count++
-            error_report = format_error_report(ip, status, url, user_agent)
-            print "CLIENT ERROR: " error_report
-        } else if (is_success(status)) {
-            success_count++
-            success_report = format_success_report(ip, method, url, bytes)
-            print "✓ " success_report
-        }
-        
-        # Track different types of requests
-        if (is_api_request(url)) {
-            api_count++
-            api_urls[api_count] = url
-        }
-        
-        if (url_is_static_file(url)) {
-            static_count++
-            static_urls[static_count] = url
-        }
-        
-        if (http_is_mutating_method(method)) {
-            mutation_count++
-            if (ip_is_public(ip)) {
-                print "EXTERNAL MUTATION: " ip " " method " " url
-            }
-        }
-        
-        # Track user types
-        if (is_bot(user_agent)) {
-            bot_count++
-            bot_agents[bot_count] = user_agent
-        } else if (user_agent_is_mobile(user_agent)) {
-            mobile_count++
-        } else if (user_agent_is_desktop(user_agent)) {
-            desktop_count++
-        }
-        
-        # Track large requests
-        if (is_large_request(bytes)) {
-            large_count++
-            large_urls[large_count] = url
-        }
-    }
-}
-
-END {
-    print ""
-    print "Summary Statistics"
-    print "===================="
-    print "Total Requests:", request_count
-    print "Successful:", success_count
-    print "Client Errors:", client_error_count
-    print "Server Errors:", server_error_count
-    print "Total Errors:", client_error_count + server_error_count
-    print "Error Rate:", sprintf("%.2f%%", ((client_error_count + server_error_count) / request_count) * 100)
-    print "API Requests:", api_count
-    print "Static Files:", static_count
-    print "Mutating Requests:", mutation_count
-    print "Mobile Users:", mobile_count
-    print "Desktop Users:", desktop_count
-    print "Bot Requests:", bot_count
-    print "Large Requests (>1MB):", large_count
-    
-    # Some functional patterns at play, map, flatMap, and take.
-    if (api_count > 0) {
-        print ""
-        print "API Usage Analysis"
-        print "===================="
-        
-        # Use map to extract API endpoints
-        $extract_endpoint = (url) -> url;
-        endpoint_count = map("extract_endpoint", api_urls, endpoints)
-        print "API Endpoints found:", endpoint_count
-    }
-    
-    if (bot_count > 0) {
-        print ""
-        print "Bot Activity Analysis"
-        print "========================"
-        
-        # Use flatMap to extract bot user agent components
-        $extract_bot_components = (user_agent, result) -> {
-            split(user_agent, result, " ")
-            return length(result)
-        };
-        bot_components_count = flatMap("extract_bot_components", bot_agents, bot_components)
-        print "Bot components analyzed:", bot_components_count
-        
-        # Use take to show top 3 bot components
-        top_components_count = take(3, bot_components, top_components)
-        print "Top bot components:", top_components_count
-    }
-    
-    print ""
-    print "End analysis"
-} 
\ No newline at end of file
+ END {
+     # Summary report, printed once after the last log line.
+     # Counters that were never incremented are still unset here; adding 0
+     # makes them print as 0 instead of as an empty string.
+     total_errors = client_error_count + server_error_count
+     # request_count is 0 when no line started with an IP address; guard the
+     # division so the report still prints instead of aborting
+     error_rate = (request_count > 0) ? (total_errors / request_count) * 100 : 0
+     
+     print ""
+     print "Summary Statistics"
+     print "===================="
+     print "Total Requests:", request_count + 0
+     print "Successful:", success_count + 0
+     print "Client Errors:", client_error_count + 0
+     print "Server Errors:", server_error_count + 0
+     print "Total Errors:", total_errors
+     print "Error Rate:", sprintf("%.2f%%", error_rate)
+     print "API Requests:", api_count + 0
+     print "Static Files:", static_count + 0
+     print "Mutating Requests:", mutation_count + 0
+     print "Mobile Users:", mobile_count + 0
+     print "Desktop Users:", desktop_count + 0
+     print "Bot Requests:", bot_count + 0
+     print "Large Requests (>1MB):", large_count + 0
+     
+     # Some functional patterns at play, map, flatMap, and take.
+     if (api_count > 0) {
+         print ""
+         print "API Usage Analysis"
+         print "===================="
+         
+         # Use map to extract API endpoints
+         endpoint_count = map("extract_endpoint", api_urls, endpoints)
+         print "API Endpoints found:", endpoint_count
+     }
+     
+     if (bot_count > 0) {
+         print ""
+         print "Bot Activity Analysis"
+         print "========================"
+         
+         # Use flatMap to extract bot user agent components
+         bot_components_count = flatMap("extract_bot_components", bot_agents, bot_components)
+         print "Bot components analyzed:", bot_components_count
+         
+         # Use take to show top 3 bot components
+         top_components_count = take(3, bot_components, top_components)
+         print "Top bot components:", top_components_count
+     }
+     
+     print ""
+     print "End analysis"
+ } 
\ No newline at end of file