about summary refs log tree commit diff stats
path: root/awk/rawk/better-predicates.awk
diff options
context:
space:
mode:
Diffstat (limited to 'awk/rawk/better-predicates.awk')
-rw-r--r-- awk/rawk/better-predicates.awk 244
1 files changed, 0 insertions, 244 deletions
diff --git a/awk/rawk/better-predicates.awk b/awk/rawk/better-predicates.awk
deleted file mode 100644
index a9a08f6..0000000
--- a/awk/rawk/better-predicates.awk
+++ /dev/null
@@ -1,244 +0,0 @@
-#!/usr/bin/awk -f
-
-#
-# AWK Predicate Functions (POSIX-Compliant)
-#
-# This script provides and demonstrates several predicate functions for validation.
-#
-# This version is strictly POSIX-compliant.
-#
-
-# --- Predicate Functions ---
-
-# is_csv(line)
-# Description:
-#   Determines if a given line is likely a CSV record.
-#   This is a heuristic check, not a full CSV parser: the line must contain
-#   at least one comma and an even number of double quotes.
-# Parameters:
-#   line - the text to inspect. Callers pass only this argument; the
-#          remaining names in the signature are local scratch variables.
-# Returns:
-#   1 if the line looks like CSV, 0 otherwise.
-# Notes:
-#   Fields are counted with split() on a private array, so the caller's
-#   $0, NF and FS are never touched.
-function is_csv(line,    _quote_count, _field_count, _fields) {
-    if (index(line, ",") == 0) { return 0 }
-    # Replacing every quote with itself ("&") leaves the text unchanged;
-    # the value of interest is the substitution count gsub() returns.
-    _quote_count = gsub(/"/, "&", line)
-    if (_quote_count % 2 != 0) { return 0 }
-    _field_count = split(line, _fields, ",")
-    return (_field_count > 1) ? 1 : 0
-}
-
-# is_tsv(line)
-# Description:
-#   Determines if a given line is likely a TSV record, i.e. it contains at
-#   least one tab and therefore splits into two or more tab-separated fields.
-# Parameters:
-#   line - the text to inspect. Callers pass only this argument; the
-#          remaining names in the signature are local scratch variables.
-# Returns:
-#   1 if the line looks like TSV, 0 otherwise.
-# Notes:
-#   Fields are counted with split() on a private array, so the caller's
-#   $0, NF and FS are never touched.
-function is_tsv(line,    _field_count, _fields) {
-    if (index(line, "\t") == 0) { return 0 }
-    _field_count = split(line, _fields, "\t")
-    return (_field_count > 1) ? 1 : 0
-}
-
-# is_uuid(str)
-# Description:
-#   Reports whether str is written in one of three UUID spellings:
-#     - hyphenated 8-4-4-4-12 hex groups,
-#     - 32 contiguous hex digits,
-#     - the hyphenated form prefixed with "urn:uuid:".
-#   Hex digits may be upper or lower case.
-# Returns:
-#   1 on a match, 0 otherwise.
-function is_uuid(str) {
-    return (str ~ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/ ||
-            str ~ /^[0-9a-fA-F]{32}$/ ||
-            str ~ /^urn:uuid:[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/) ? 1 : 0
-}
-
-# is_email(str)
-# Description:
-#   Pragmatic e-mail address check (loosely modelled on RFC 5322, not a full
-#   implementation). The address must have exactly one '@', a non-empty
-#   local part without spaces, edge quotes, or leading/trailing/double dots,
-#   and a dotted domain of letters, digits and hyphens whose last label is
-#   at least two letters long.
-# Parameters:
-#   str - the candidate address. Callers pass only this argument; the
-#         remaining names in the signature are local scratch variables.
-# Returns:
-#   1 if str looks like an e-mail address, 0 otherwise.
-function is_email(str,    _parts, _local, _domain, _labels, _label_count) {
-    # Count '@' in str itself. gsub() without a third argument would count
-    # in $0 instead. Replacing with "&" leaves str unchanged.
-    if (gsub(/@/, "&", str) != 1) { return 0 }
-
-    split(str, _parts, "@")
-    _local = _parts[1]
-    _domain = _parts[2]
-
-    # Neither side of the '@' may be empty.
-    if (length(_local) == 0 || length(_domain) == 0) { return 0 }
-
-    # Local part: reject spaces, a quote at either end, a dot at either end,
-    # and consecutive dots.
-    if (_local ~ /[ ]/ || _local ~ /^"|"$|^\.|\.$|\.\./) { return 0 }
-
-    # Domain: needs a dot, but not at either end and never doubled.
-    if (index(_domain, ".") == 0) { return 0 }
-    if (_domain ~ /\.\./ || _domain ~ /^\./ || _domain ~ /\.$/) { return 0 }
-
-    # The last label (TLD) must be at least 2 characters long. split()'s
-    # return value gives the label count portably; length(array) does not.
-    _label_count = split(_domain, _labels, ".")
-    if (length(_labels[_label_count]) < 2) { return 0 }
-
-    # Domain may only contain letters, digits, dots and hyphens.
-    if (_domain ~ /[^a-zA-Z0-9.-]/) { return 0 }
-
-    # Final whole-string shape check, including the allowed local-part
-    # punctuation and an alphabetic TLD.
-    return (str ~ /^[a-zA-Z0-9!#$%&'*+\/=?^_`{|}~.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/) ? 1 : 0
-}
-
-# is_url(str)
-# Description:
-#   Validates a URL against common schemes and structure.
-#   Supports http, https, ftp, ftps, mailto, tel.
-#   Accepted shape: scheme ":" ["//"] [userinfo "@"] host [":" port]
-#   ["/" path] ["?" query]. This is a practical, not exhaustive, check.
-# Parameters:
-#   str - the candidate URL.
-# Returns:
-#   1 if str matches the shape above, 0 otherwise. http(s) and ftp(s) URLs
-#   must additionally carry "//" right after the scheme.
-function is_url(str) {
-    # The optional "userinfo@" group admits forms such as
-    # ftp://user:pass@host and mailto:user@host. It may not contain
-    # '/', '?', '#', '@' or a space, so it cannot swallow part of the path.
-    if (str !~ /^(https|http|ftp|ftps|mailto|tel):(\/\/)?([^\/?#@ ]+@)?[a-zA-Z0-9.-]+(:[0-9]+)?(\/.*)?(\?.*)?$/) {
-        return 0
-    }
-    # Hierarchical schemes need "//" directly after the scheme; anchoring
-    # the test keeps a "://" later in the path from satisfying it.
-    if (str ~ /^(https?|ftps?):/ && str !~ /^[a-z]+:\/\//) {
-        return 0
-    }
-    return 1
-}
-
-# _ipv6_group_count(groups)
-# Description:
-#   Private helper for is_ipv6(). Splits a colon-separated run of groups and
-#   checks that each one is 1 to 4 hex digits.
-# Returns:
-#   The number of groups (0 for an empty string), or -1 if any group is
-#   empty or malformed.
-function _ipv6_group_count(groups,    _n, _i, _g) {
-    _n = split(groups, _g, ":")
-    for (_i = 1; _i <= _n; _i++) {
-        if (length(_g[_i]) > 4 || _g[_i] !~ /^[0-9a-fA-F]+$/) { return -1 }
-    }
-    return _n
-}
-
-# is_ipv6(str)
-# Description:
-#   Validates a textual IPv6 address, handling compressed zeros (::) and an
-#   optional interface/zone identifier introduced by '%' (e.g. fe80::1%eth0).
-#   The zone text itself is not validated. Embedded dotted-quad IPv4
-#   suffixes (::ffff:1.2.3.4) are not supported.
-# Parameters:
-#   str - the candidate address. Callers pass only this argument; the
-#         remaining names in the signature are local scratch variables.
-# Returns:
-#   1 if str is a well-formed address, 0 otherwise.
-function is_ipv6(str,    _addr, _pct, _gap, _head_count, _tail_count) {
-    # Drop the optional interface identifier.
-    _addr = str
-    _pct = index(_addr, "%")
-    if (_pct > 0) {
-        _addr = substr(_addr, 1, _pct - 1)
-    }
-    if (_addr == "") { return 0 }
-
-    _gap = index(_addr, "::")
-    if (_gap == 0) {
-        # Uncompressed form: exactly eight groups.
-        return (_ipv6_group_count(_addr) == 8) ? 1 : 0
-    }
-
-    # Compressed form: validate the groups on each side of the first "::".
-    # A second "::" or a ":::" leaves an empty group on the right-hand side,
-    # which the helper rejects.
-    _head_count = _ipv6_group_count(substr(_addr, 1, _gap - 1))
-    _tail_count = _ipv6_group_count(substr(_addr, _gap + 2))
-    if (_head_count < 0 || _tail_count < 0) { return 0 }
-
-    # "::" stands for at least one zero group, so at most seven are explicit.
-    return (_head_count + _tail_count <= 7) ? 1 : 0
-}
-
-# is_palindrome(str)
-# Description:
-#   Checks if a string is a palindrome, ignoring case, whitespace, and
-#   punctuation: only the characters a-z and 0-9 (after lower-casing) are
-#   compared.
-# Parameters:
-#   str - the text to test. Callers pass only this argument; the remaining
-#         names in the signature are local scratch variables, so the
-#         caller's own _clean_str, _len and _i are left alone.
-# Returns:
-#   1 if the cleaned text reads the same in both directions (an empty
-#   cleaned text counts as a palindrome), 0 otherwise.
-function is_palindrome(str,    _clean_str, _len, _i) {
-    _clean_str = tolower(str)
-    gsub(/[^a-z0-9]/, "", _clean_str)
-    _len = length(_clean_str)
-    if (_len == 0) { return 1 } # Empty string is a palindrome
-    # Compare mirrored positions; the middle character of an odd-length
-    # string needs no partner.
-    for (_i = 1; _i <= _len / 2; _i++) {
-        if (substr(_clean_str, _i, 1) != substr(_clean_str, _len - _i + 1, 1)) {
-            return 0
-        }
-    }
-    return 1
-}
-
-# is_hex(str)
-# Description:
-#   Checks if a string is a valid hexadecimal value, optionally with a
-#   0x / 0X or # prefix.
-# Parameters:
-#   str - the text to test. Callers pass only this argument; _digits is a
-#         local scratch variable.
-# Returns:
-#   1 if at least one hex digit remains after the optional prefix and
-#   nothing else does, 0 otherwise (a bare prefix is not valid).
-function is_hex(str,    _digits) {
-    _digits = str
-    # Strip at most one leading prefix.
-    sub(/^(0[xX]|#)/, "", _digits)
-    # The '+' already rejects an empty remainder, i.e. a prefix on its own.
-    return (_digits ~ /^[0-9a-fA-F]+$/) ? 1 : 0
-}
-
-
-# --- Demonstration Block ---
-# Runs every predicate against a handful of sample inputs and prints one
-# line per sample in the form:  Test '<label>': [<input>] -> <verdict>
-# Each sample is stored in a per-predicate array, classified, then printed.
-BEGIN {
-    print "--- Testing is_csv() ---"
-    csv_tests["valid1"] = "name,age,city"
-    result = is_csv(csv_tests["valid1"]) ? "CSV" : "Not CSV"
-    printf("Test '%s': [%s] -> %s\n", "valid1", csv_tests["valid1"], result)
-
-    csv_tests["valid2"] = "\"Smith, John\",30,\"New York\""
-    result = is_csv(csv_tests["valid2"]) ? "CSV" : "Not CSV"
-    printf("Test '%s': [%s] -> %s\n", "valid2", csv_tests["valid2"], result)
-
-    csv_tests["invalid1"] = "this is a normal sentence"
-    result = is_csv(csv_tests["invalid1"]) ? "CSV" : "Not CSV"
-    printf("Test '%s': [%s] -> %s\n", "invalid1", csv_tests["invalid1"], result)
-
-    print "\n--- Testing is_tsv() ---"
-    tsv_tests["valid1"] = "name\tage\tcity"
-    result = is_tsv(tsv_tests["valid1"]) ? "TSV" : "Not TSV"
-    printf("Test '%s': [%s] -> %s\n", "valid1", tsv_tests["valid1"], result)
-
-    tsv_tests["invalid1"] = "this is a normal sentence"
-    result = is_tsv(tsv_tests["invalid1"]) ? "TSV" : "Not TSV"
-    printf("Test '%s': [%s] -> %s\n", "invalid1", tsv_tests["invalid1"], result)
-
-    print "\n--- Testing is_uuid() ---"
-    uuid_tests["valid_hyphens"] = "123e4567-e89b-12d3-a456-426614174000"
-    result = is_uuid(uuid_tests["valid_hyphens"]) ? "UUID" : "Not UUID"
-    printf("Test '%s': [%s] -> %s\n", "valid_hyphens", uuid_tests["valid_hyphens"], result)
-
-    uuid_tests["invalid_short"] = "123e4567-e89b"
-    result = is_uuid(uuid_tests["invalid_short"]) ? "UUID" : "Not UUID"
-    printf("Test '%s': [%s] -> %s\n", "invalid_short", uuid_tests["invalid_short"], result)
-
-    print "\n--- Testing is_email() ---"
-    email_tests["valid"] = "test.user+alias@domain.co.uk"
-    result = is_email(email_tests["valid"]) ? "Email" : "Not Email"
-    printf("Test '%s': [%s] -> %s\n", "valid", email_tests["valid"], result)
-
-    email_tests["invalid_no_at"] = "test.domain.com"
-    result = is_email(email_tests["invalid_no_at"]) ? "Email" : "Not Email"
-    printf("Test '%s': [%s] -> %s\n", "invalid_no_at", email_tests["invalid_no_at"], result)
-
-    email_tests["invalid_double_at"] = "user@@domain.com"
-    result = is_email(email_tests["invalid_double_at"]) ? "Email" : "Not Email"
-    printf("Test '%s': [%s] -> %s\n", "invalid_double_at", email_tests["invalid_double_at"], result)
-
-    email_tests["invalid_spaces"] = "user name@domain.com"
-    result = is_email(email_tests["invalid_spaces"]) ? "Email" : "Not Email"
-    printf("Test '%s': [%s] -> %s\n", "invalid_spaces", email_tests["invalid_spaces"], result)
-
-    email_tests["invalid_double_dot"] = "user@domain..com"
-    result = is_email(email_tests["invalid_double_dot"]) ? "Email" : "Not Email"
-    printf("Test '%s': [%s] -> %s\n", "invalid_double_dot", email_tests["invalid_double_dot"], result)
-
-    print "\n--- Testing is_url() ---"
-    url_tests["valid_https"] = "https://www.example.com/path?query=1"
-    result = is_url(url_tests["valid_https"]) ? "URL" : "Not URL"
-    printf("Test '%s': [%s] -> %s\n", "valid_https", url_tests["valid_https"], result)
-
-    url_tests["valid_ftp"] = "ftp://user:pass@ftp.example.com"
-    result = is_url(url_tests["valid_ftp"]) ? "URL" : "Not URL"
-    printf("Test '%s': [%s] -> %s\n", "valid_ftp", url_tests["valid_ftp"], result)
-
-    url_tests["valid_mailto"] = "mailto:user@example.com"
-    result = is_url(url_tests["valid_mailto"]) ? "URL" : "Not URL"
-    printf("Test '%s': [%s] -> %s\n", "valid_mailto", url_tests["valid_mailto"], result)
-
-    url_tests["invalid_scheme"] = "htp://example.com"
-    result = is_url(url_tests["invalid_scheme"]) ? "URL" : "Not URL"
-    printf("Test '%s': [%s] -> %s\n", "invalid_scheme", url_tests["invalid_scheme"], result)
-
-    url_tests["invalid_incomplete"] = "http://"
-    result = is_url(url_tests["invalid_incomplete"]) ? "URL" : "Not URL"
-    printf("Test '%s': [%s] -> %s\n", "invalid_incomplete", url_tests["invalid_incomplete"], result)
-
-    print "\n--- Testing is_ipv6() ---"
-    ipv6_tests["valid_full"] = "2001:0db8:85a3:0000:0000:8a2e:0370:7334"
-    result = is_ipv6(ipv6_tests["valid_full"]) ? "IPv6" : "Not IPv6"
-    printf("Test '%s': [%s] -> %s\n", "valid_full", ipv6_tests["valid_full"], result)
-
-    ipv6_tests["valid_compressed"] = "2001:db8::1"
-    result = is_ipv6(ipv6_tests["valid_compressed"]) ? "IPv6" : "Not IPv6"
-    printf("Test '%s': [%s] -> %s\n", "valid_compressed", ipv6_tests["valid_compressed"], result)
-
-    ipv6_tests["valid_interface"] = "fe80::1%eth0"
-    result = is_ipv6(ipv6_tests["valid_interface"]) ? "IPv6" : "Not IPv6"
-    printf("Test '%s': [%s] -> %s\n", "valid_interface", ipv6_tests["valid_interface"], result)
-
-    ipv6_tests["invalid_double_compress"] = "2001:db8::1::2"
-    result = is_ipv6(ipv6_tests["invalid_double_compress"]) ? "IPv6" : "Not IPv6"
-    printf("Test '%s': [%s] -> %s\n", "invalid_double_compress", ipv6_tests["invalid_double_compress"], result)
-
-    print "\n--- Testing is_palindrome() ---"
-    palindrome_tests["valid_racecar"] = "Racecar"
-    result = is_palindrome(palindrome_tests["valid_racecar"]) ? "Palindrome" : "Not Palindrome"
-    printf("Test '%s': [%s] -> %s\n", "valid_racecar", palindrome_tests["valid_racecar"], result)
-
-    palindrome_tests["valid_madam"] = "A man, a plan, a canal: Panama"
-    result = is_palindrome(palindrome_tests["valid_madam"]) ? "Palindrome" : "Not Palindrome"
-    printf("Test '%s': [%s] -> %s\n", "valid_madam", palindrome_tests["valid_madam"], result)
-
-    palindrome_tests["invalid"] = "hello world"
-    result = is_palindrome(palindrome_tests["invalid"]) ? "Palindrome" : "Not Palindrome"
-    printf("Test '%s': [%s] -> %s\n", "invalid", palindrome_tests["invalid"], result)
-
-    print "\n--- Testing is_hex() ---"
-    hex_tests["valid_simple"] = "deadbeef"
-    result = is_hex(hex_tests["valid_simple"]) ? "Hex" : "Not Hex"
-    printf("Test '%s': [%s] -> %s\n", "valid_simple", hex_tests["valid_simple"], result)
-
-    hex_tests["valid_0x"] = "0xDEADBEEF"
-    result = is_hex(hex_tests["valid_0x"]) ? "Hex" : "Not Hex"
-    printf("Test '%s': [%s] -> %s\n", "valid_0x", hex_tests["valid_0x"], result)
-
-    hex_tests["valid_hash"] = "#ff0000"
-    result = is_hex(hex_tests["valid_hash"]) ? "Hex" : "Not Hex"
-    printf("Test '%s': [%s] -> %s\n", "valid_hash", hex_tests["valid_hash"], result)
-
-    hex_tests["invalid_chars"] = "0xGHIJKL"
-    result = is_hex(hex_tests["invalid_chars"]) ? "Hex" : "Not Hex"
-    printf("Test '%s': [%s] -> %s\n", "invalid_chars", hex_tests["invalid_chars"], result)
-
-    # Stop here so awk does not go on to read records from stdin.
-    exit
-}