diff options
Diffstat (limited to 'awk/rawk/better-predicates.awk')
-rw-r--r-- | awk/rawk/better-predicates.awk | 244 |
1 files changed, 0 insertions, 244 deletions
#!/usr/bin/awk -f

#
# AWK Predicate Functions (POSIX-Compliant)
#
# This script provides and demonstrates several predicate functions for validation.
#
# This version is strictly POSIX-compliant.
#

# --- Predicate Functions ---

# is_csv(line, _quote_count)
# Description:
#   Heuristically decides whether `line` looks like a CSV record.  This is not
#   a CSV parser: a line qualifies when it contains at least one comma and an
#   even number of double-quote characters (so no quoted field is left open).
# Parameters:
#   line         - the text to inspect.
#   _quote_count - local scratch variable; callers must not pass it.
# Returns:
#   1 if the line looks like CSV, 0 otherwise.
# Notes:
#   The function never touches FS, NF or $0.  Earlier revisions re-split the
#   caller's record ($0 = line) to count fields and never restored it; that
#   count was redundant anyway, because a line containing a comma always has
#   more than one comma-separated field.
function is_csv(line,    _quote_count) {
    if (index(line, ",") == 0) { return 0 }

    # gsub() with "&" as the replacement rewrites every match with itself, so
    # it serves purely as a counter; `line` is a by-value copy in any case.
    _quote_count = gsub(/"/, "&", line)

    return (_quote_count % 2 == 0) ? 1 : 0
}

# is_tsv(line)
# Description:
#   Heuristically decides whether `line` looks like a TSV record.
# Parameters:
#   line - the text to inspect.
# Returns:
#   1 if the line contains at least one tab character (and therefore at least
#   two tab-separated fields), 0 otherwise.
# Notes:
#   Like is_csv(), this leaves FS, NF and $0 untouched.
function is_tsv(line) {
    return (index(line, "\t") > 0) ? 1 : 0
}

# is_uuid(str)
# Description:
#   Checks if a string conforms to common UUID formats:
#     - 8-4-4-4-12 hexadecimal digits separated by hyphens,
#     - the same, prefixed with "urn:uuid:",
#     - 32 hexadecimal digits with no separators.
# Parameters:
#   str - the text to inspect.
# Returns:
#   1 if `str` matches one of the formats above, 0 otherwise.
# Notes:
#   Relies on ERE interval expressions ({n}); these are part of POSIX ERE.
function is_uuid(str) {
    if (str ~ /^(urn:uuid:)?[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/) {
        return 1
    }
    return (str ~ /^[0-9a-fA-F]{32}$/) ? 1 : 0
}

# is_email(str)
# Description:
#   Performs a robust check for email format validity, inspired by RFC 5322.
#   It checks for a single '@', validates local and domain parts, and disallows common errors.
function is_email(str,    parts, domain_parts, local_part, domain_part, n_labels) {
    # Accepts a practical subset of RFC 5322 addresses: exactly one '@', an
    # unquoted local part, and a dotted domain made of letters, digits and
    # hyphens that ends in an alphabetic TLD of two or more characters.
    # Returns 1 when `str` qualifies, 0 otherwise.  Every name after `str` in
    # the signature is a local scratch variable; callers pass `str` only.

    # split() produces exactly two pieces only when `str` holds exactly one
    # '@'.  (A target-less gsub(/@/, "&") would count the '@' characters in
    # $0 instead of in `str`.)
    if (split(str, parts, "@") != 2) { return 0 }
    local_part = parts[1]
    domain_part = parts[2]

    # Local and domain parts must not be empty.
    if (length(local_part) == 0 || length(domain_part) == 0) { return 0 }

    # Local part: no spaces, no surrounding quotes, no leading/trailing dot
    # and no consecutive dots.
    if (local_part ~ /[ ]/ || local_part ~ /^"|"$|^\.|\.$|\.\./) { return 0 }

    # Domain part: at least one dot, no empty labels.
    if (index(domain_part, ".") == 0) { return 0 }
    if (domain_part ~ /\.\./ || domain_part ~ /^\./ || domain_part ~ /\.$/) { return 0 }

    # TLD must be at least 2 characters long.  The label count comes from
    # split()'s return value, which is portable, unlike length(array).
    n_labels = split(domain_part, domain_parts, ".")
    if (length(domain_parts[n_labels]) < 2) { return 0 }

    # Domain labels may only contain letters, digits and hyphens.
    if (domain_part ~ /[^a-zA-Z0-9.-]/) { return 0 }

    # Final whole-string shape check; "[a-zA-Z][a-zA-Z]+" is "{2,}" spelled
    # without an interval expression.
    if (str ~ /^[a-zA-Z0-9!#$%&'*+\/=?^_`{|}~.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z][a-zA-Z]+$/) {
        return 1
    }

    return 0
}

# is_url(str, _addr, _cut)
# Description:
#   Validates a URL against common schemes and structure.
#   Supports http, https, ftp, ftps, mailto, tel.
#   This is a practical, not exhaustive, check:
#     - http/https/ftp/ftps require "//", then optional "userinfo@", a host of
#       letters/digits/dots/hyphens, an optional ":port", and an optional
#       remainder starting with "/", "?" or "#".
#     - mailto: requires an address accepted by is_email(); anything from the
#       first "?" on (header fields such as ?subject=) is ignored.
#     - tel: requires an optional "+" followed by digits, which may be
#       grouped with "(", ")", "." and "-".
# Parameters:
#   str          - the text to inspect.
#   _addr, _cut  - local scratch variables; callers must not pass them.
# Returns:
#   1 if `str` is an acceptable URL, 0 otherwise.
function is_url(str,    _addr, _cut) {
    if (str ~ /^(https?|ftps?):\/\/([^\/?#@ ]+@)?[a-zA-Z0-9.-]+(:[0-9]+)?([\/?#].*)?$/) {
        return 1
    }

    if (str ~ /^mailto:/) {
        _addr = substr(str, 8)          # text after "mailto:"
        _cut = index(_addr, "?")
        if (_cut > 0) { _addr = substr(_addr, 1, _cut - 1) }
        return is_email(_addr)
    }

    if (str ~ /^tel:\+?[(]?[0-9][0-9().-]*$/) { return 1 }

    return 0
}

# is_ipv6(str, _addr, _parts, _i, _num_parts, _groups, _compressed)
# Description:
#   Validates an IPv6 address, handling compressed zeros (::) and interface
#   identifiers (a "%zone" suffix, which is stripped before validation).
#   Addresses with an embedded dotted-quad IPv4 tail are not recognised.
# Parameters:
#   str - the text to inspect.
#   The remaining names are local scratch variables; callers must not pass them.
# Returns:
#   1 if `str` is a valid address, 0 otherwise.
function is_ipv6(str,    _addr, _parts, _i, _num_parts, _groups, _compressed) {
    # Drop an optional interface identifier (e.g., %eth0).
    _addr = str
    _i = index(_addr, "%")
    if (_i > 0) { _addr = substr(_addr, 1, _i - 1) }

    if (length(_addr) == 0) { return 0 }

    # Only hex digits and colons may remain.
    if (_addr ~ /[^0-9a-fA-F:]/) { return 0 }

    # Three or more consecutive colons are never valid, and "::" may appear at
    # most once.  gsub() is given an explicit target so that it counts within
    # _addr rather than $0; "&" keeps the text unchanged.
    if (_addr ~ /:::/) { return 0 }
    _compressed = gsub(/::/, "&", _addr)
    if (_compressed > 1) { return 0 }

    # A single colon at either end (not part of "::") is invalid.
    if (_addr ~ /^:[^:]/ || _addr ~ /[^:]:$/) { return 0 }

    # Count the explicit groups; empty pieces can only come from "::" here.
    _num_parts = split(_addr, _parts, ":")
    _groups = 0
    for (_i = 1; _i <= _num_parts; _i++) {
        if (length(_parts[_i]) == 0) { continue }
        # Characters were validated above, so only the 1-4 length rule remains.
        if (length(_parts[_i]) > 4) { return 0 }
        _groups++
    }

    # "::" stands for at least one all-zero group, so at most 7 groups may be
    # written out next to it; without compression exactly 8 are required.
    if (_compressed) { return (_groups <= 7) ? 1 : 0 }
    return (_groups == 8) ? 1 : 0
}

# is_palindrome(str, _clean_str, _len, _i)
# Description:
#   Checks if a string is a palindrome, ignoring case, whitespace, and punctuation.
# Parameters:
#   str - the text to inspect.
#   The remaining names are local scratch variables; callers must not pass them.
# Returns:
#   1 if the letters and digits of `str` read the same in both directions
#   (an input with no letters or digits counts as a palindrome), 0 otherwise.
function is_palindrome(str,    _clean_str, _len, _i) {
    _clean_str = tolower(str)
    gsub(/[^a-z0-9]/, "", _clean_str)
    _len = length(_clean_str)

    # Compare mirrored positions; the loop body never runs for an empty string.
    for (_i = 1; _i <= int(_len / 2); _i++) {
        if (substr(_clean_str, _i, 1) != substr(_clean_str, _len - _i + 1, 1)) {
            return 0
        }
    }
    return 1
}

# is_hex(str, _digits)
# Description:
#   Checks if a string is a valid hexadecimal value, optionally with a 0x or # prefix.
# Parameters:
#   str     - the text to inspect.
#   _digits - local scratch variable; callers must not pass it.
# Returns:
#   1 if at least one hex digit follows the optional prefix and nothing else
#   is present, 0 otherwise (a bare prefix is rejected).
function is_hex(str,    _digits) {
    _digits = str
    if (_digits ~ /^0[xX]/) {
        _digits = substr(_digits, 3)
    } else if (substr(_digits, 1, 1) == "#") {
        _digits = substr(_digits, 2)
    }
    # The "+" quantifier also rejects the empty remainder of a bare prefix.
    return (_digits ~ /^[0-9a-fA-F]+$/) ? 1 : 0
}

# show_test(name, input, ok, yes_label, no_label)
# Description:
#   Prints one demonstration line in the form
#     Test '<name>': [<input>] -> <label>
#   where <label> is yes_label when `ok` is true and no_label otherwise.
function show_test(name, input, ok, yes_label, no_label) {
    printf("Test '%s': [%s] -> %s\n", name, input, ok ? yes_label : no_label)
}


# --- Demonstration Block ---
BEGIN {
    print "--- Testing is_csv() ---"
    s = "name,age,city";                    show_test("valid1", s, is_csv(s), "CSV", "Not CSV")
    s = "\"Smith, John\",30,\"New York\"";  show_test("valid2", s, is_csv(s), "CSV", "Not CSV")
    s = "this is a normal sentence";        show_test("invalid1", s, is_csv(s), "CSV", "Not CSV")

    print "\n--- Testing is_tsv() ---"
    s = "name\tage\tcity";                  show_test("valid1", s, is_tsv(s), "TSV", "Not TSV")
    s = "this is a normal sentence";        show_test("invalid1", s, is_tsv(s), "TSV", "Not TSV")

    print "\n--- Testing is_uuid() ---"
    s = "123e4567-e89b-12d3-a456-426614174000"; show_test("valid_hyphens", s, is_uuid(s), "UUID", "Not UUID")
    s = "123e4567-e89b";                        show_test("invalid_short", s, is_uuid(s), "UUID", "Not UUID")

    print "\n--- Testing is_email() ---"
    s = "test.user+alias@domain.co.uk";     show_test("valid", s, is_email(s), "Email", "Not Email")
    s = "test.domain.com";                  show_test("invalid_no_at", s, is_email(s), "Email", "Not Email")
    s = "user@@domain.com";                 show_test("invalid_double_at", s, is_email(s), "Email", "Not Email")
    s = "user name@domain.com";             show_test("invalid_spaces", s, is_email(s), "Email", "Not Email")
    s = "user@domain..com";                 show_test("invalid_double_dot", s, is_email(s), "Email", "Not Email")

    print "\n--- Testing is_url() ---"
    s = "https://www.example.com/path?query=1"; show_test("valid_https", s, is_url(s), "URL", "Not URL")
    s = "ftp://user:pass@ftp.example.com";      show_test("valid_ftp", s, is_url(s), "URL", "Not URL")
    s = "mailto:user@example.com";              show_test("valid_mailto", s, is_url(s), "URL", "Not URL")
    s = "htp://example.com";                    show_test("invalid_scheme", s, is_url(s), "URL", "Not URL")
    s = "http://";                              show_test("invalid_incomplete", s, is_url(s), "URL", "Not URL")

    print "\n--- Testing is_ipv6() ---"
    s = "2001:0db8:85a3:0000:0000:8a2e:0370:7334"; show_test("valid_full", s, is_ipv6(s), "IPv6", "Not IPv6")
    s = "2001:db8::1";                             show_test("valid_compressed", s, is_ipv6(s), "IPv6", "Not IPv6")
    s = "fe80::1%eth0";                            show_test("valid_interface", s, is_ipv6(s), "IPv6", "Not IPv6")
    s = "2001:db8::1::2";                          show_test("invalid_double_compress", s, is_ipv6(s), "IPv6", "Not IPv6")

    print "\n--- Testing is_palindrome() ---"
    s = "Racecar";                          show_test("valid_racecar", s, is_palindrome(s), "Palindrome", "Not Palindrome")
    s = "A man, a plan, a canal: Panama";   show_test("valid_madam", s, is_palindrome(s), "Palindrome", "Not Palindrome")
    s = "hello world";                      show_test("invalid", s, is_palindrome(s), "Palindrome", "Not Palindrome")

    print "\n--- Testing is_hex() ---"
    s = "deadbeef";                         show_test("valid_simple", s, is_hex(s), "Hex", "Not Hex")
    s = "0xDEADBEEF";                       show_test("valid_0x", s, is_hex(s), "Hex", "Not Hex")
    s = "#ff0000";                          show_test("valid_hash", s, is_hex(s), "Hex", "Not Hex")
    s = "0xGHIJKL";                         show_test("invalid_chars", s, is_hex(s), "Hex", "Not Hex")

    # Exit after demonstration to prevent processing stdin
    exit
}