# See https://url.spec.whatwg.org/#url-parsing. import strutils import tables import options import unicode import math import utils/twtstr type UrlState = enum SCHEME_START_STATE, SCHEME_STATE, NO_SCHEME_STATE, FILE_STATE, SPECIAL_RELATIVE_OR_AUTHORITY_STATE, SPECIAL_AUTHORITY_SLASHES_STATE, PATH_OR_AUTHORITY_STATE, OPAQUE_PATH_STATE, FRAGMENT_STATE, RELATIVE_STATE, SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE, AUTHORITY_STATE, PATH_STATE, RELATIVE_SLASH_STATE, QUERY_STATE, HOST_STATE, HOSTNAME_STATE, FILE_HOST_STATE, PORT_STATE, PATH_START_STATE, FILE_SLASH_STATE Blob* = object BlobUrlEntry* = object obj: Blob #TODO UrlPath* = object case opaque*: bool of true: s*: string else: ss*: seq[string] Host = object domain: string ipv4: Option[uint32] ipv6: Option[array[8, uint16]] opaquehost: string Url* = object encoding: int #TODO scheme*: string username: string password: string port: Option[uint16] host: Option[Host] path*: UrlPath query*: Option[string] fragment: Option[string] blob: Option[BlobUrlEntry] const EmptyPath = UrlPath(opaque: true, s: "") const EmptyHost = Host(domain: "").some const SpecialSchemes = { "ftp": 21u16.some, "file": none(uint16), "http": 80u16.some, "https": 443u16.some, "ws": 80u16.some, "wss": 443u16.some, }.toTable() func parseIpv6(input: string): Option[array[8, uint16]] = var pieceindex = 0 var compress = -1 var pointer = 0 var address: array[8, uint16] template c(i = 0): char = input[pointer + i] template has(i = 0): bool = (pointer + i < input.len) template failure(): Option[array[8, uint16]] = none(array[8, uint16]) if c == ':': if not has(1) or c(1) != ':': #TODO validation error return failure pointer += 2 inc pieceindex compress = pieceindex while has: if pieceindex == 8: #TODO validation error return failure if c == ':': if compress != -1: #TODO validation error return failure inc pointer inc pieceindex compress = pieceindex continue var value: uint16 = 0 var length = 0 while length < 4 and has and c in AsciiHexDigit: value = value * 0x10 + uint16(c.hexValue) inc pointer inc length if has and c == '.': if length == 0: #TODO validation error return failure pointer -= length if pieceindex > 6: #TODO validation error return failure var numbersseen = 0 while has: var ipv4piece = -1 if numbersseen > 0: if c == '.' and numbersseen < 4: inc pointer else: #TODO validation error return failure if not has or c notin Digits: #TODO validation error return failure while has and c in Digits: if ipv4piece == -1: ipv4piece = c.decValue elif ipv4piece == 0: #TODO validation error return failure else: ipv4piece = ipv4piece * 10 + c.decValue if ipv4piece > 255: #TODO validation error return failure inc pointer address[pieceindex] = address[pieceindex] * 0x100 + uint16(ipv4piece) inc numbersseen if numbersseen == 2 or numbersseen == 4: inc pieceindex if numbersseen != 4: #TODO validation error return failure break elif has and c == ':': inc pointer if not has: #TODO validation error return failure address[pieceindex] = value inc pieceindex if compress != -1: var swaps = pieceindex - compress pieceindex = 7 while pieceindex != 0 and swaps > 0: let sp = address[pieceindex] address[pieceindex] = address[compress + swaps - 1] address[compress + swaps - 1] = sp dec pieceindex dec swaps elif pieceindex != 8: #TODO validation error return failure return address.some func parseIpv4Number(s: string): tuple[num: int, validationError: bool] = if s == "": return (-1, true) var input = s var R = 10 var validationerror = false if input.len >= 2 and input[0] == '0': if input[1] in {'x', 'X'}: validationerror = true input = input.substr(2) R = 16 else: validationerror = true input = input.substr(1) R = 8 if input == "": return (0, true) var output = 0 try: case R of 8: output = parseOctInt(input) of 10: output = parseInt(input) of 16: output = parseHexInt(input) else: discard except ValueError: return (-1, true) return (output, validationerror) func parseIpv4(input: string): Option[uint32] = var validationerror = false var parts = input.split('.') if parts[^1] == "": validationerror = true if parts.len > 1: discard parts.pop() if parts.len > 4: #TODO validation error return none(uint32) var numbers: seq[int] for i in low(parts)..high(parts): let part = parts[i] let pp = parseIpv4Number(part) if pp[0] < 0: #TODO validation error return none(uint32) if pp[0] > 255: validationerror = true if i != high(parts): #TODO validation error return none(uint32) if pp[1]: validationerror = true numbers.add(pp[0]) #TODO validation error if validationerror if numbers[^1] >= 256^(5-numbers.len): #TODO validation error return none(uint32) var ipv4 = uint32(numbers[^1]) discard numbers.pop() for i in 0..numbers.high: let n = uint32(numbers[i]) ipv4 += n * (256u32 ^ (3 - i)) return ipv4.some const ForbiddenHostChars = { chr(0x00), '\t', '\n', '\r', ' ', '#', '%', '/', ':', '<', '>', '?', '@', '[', '\\', ']', '^', '|' } func opaqueParseHost(input: string): Option[Host] = for c in input: if c in ForbiddenHostChars: return none(Host) #TODO If input contains a code point that is not a URL code point and not #U+0025 (%), validation error. #TODO If input contains a U+0025 (%) and the two code points following it #are not ASCII hex digits, validation error. var o = "" for c in input: o.percentEncode(c, ControlPercentEncodeSet) func endsInNumber(input: string): bool = var parts = input.split('.') if parts[^1] == "": if parts.len == 1: return false discard parts.pop() if parts.len == 0: return false var last = parts[^1] if last != "": if last.len == 2 and last[0] in Digits and last[1].tolower() == 'x': last = last.substr(2) for c in last: if c notin Digits: return false return true return false func domainToAscii*(domain: string, bestrict = false): Option[string] = var needsprocessing = false for s in domain.split('.'): var i = 0 var xn = 0 while i < s.len: if s[i] notin Ascii: needsprocessing = true break case i of 0: if s[i] == 'x': inc xn of 1: if s[i] == 'n': inc xn of 2: if s[i] == '-': inc xn of 3: if s[i] == '-' and xn == 3: needsprocessing = true break else: discard inc i if needsprocessing: break if bestrict or needsprocessing: #Note: we don't implement STD3 separately, it's always true result = domain.unicodeToAscii(false, true, true, false, bestrict) if result.isnone or result.get == "": #TODO validation error return none(string) return result else: return domain.toAsciiLower().some func parseHost(input: string, isnotspecial = false): Option[Host] = if input.len == 0: return if input[0] == '[': if input[^1] != ']': #TODO validation error return none(Host) return Host(ipv6: parseIpv6(input.substr(1, input.high - 1))).some if isnotspecial: #TODO ?? return opaqueParseHost(input) let domain = percentDecode(input) let asciiDomain = domain.domainToAscii() if asciiDomain.isnone: #TODO validation error return none(Host) for c in asciiDomain.get: if c in ForbiddenHostChars: #TODO validation error return none(Host) if asciiDomain.get.len > 0 and asciiDomain.get.endsInNumber(): let ipv4 = parseIpv4(asciiDomain.get) return Host(ipv4: ipv4).some return Host(domain: asciiDomain.get).some func isempty(host: Host): bool = return host.domain == "" and host.ipv4.isnone and host.ipv6.isnone and host.opaquehost == "" proc shorten_path(url: var Url) {.inline.} = assert not url.path.opaque if url.scheme == "file" and url.path.ss.len == 1 and url.path.ss[0][0] in Letters and url.path.ss[0][1] == ':': return if url.path.ss.len > 0: discard url.path.ss.pop() proc append(path: var UrlPath, s: string) = if path.opaque: path.s &= s else: path.ss.add(s) template includes_credentials(url: Url): bool = url.username != "" or url.password != "" template is_windows_drive_letter(s: string): bool = s.len == 2 and s[0] in Letters and (s[1] == ':' or s[1] == '|') #TODO encoding proc basicParseUrl*(input: string, base = none(Url), url: var Url = Url(), override: bool = false): Option[Url] = #TODO If input contains any leading or trailing C0 control or space, validation error. #TODO If input contains any ASCII tab or newline, validation error. let input = input.strip(true, false, {chr(0x00)..chr(0x1F), ' '}).strip(true, false, {'\t', '\n'}) var buffer = "" var atsignseen = false var insidebrackets = false var passwordtokenseen = false var pointer = 0 var state = SCHEME_START_STATE template c(i = 0): char = input[pointer + i] template has(i = 0): bool = (pointer + i < input.len) template is_special(url: Url): bool = url.scheme in SpecialSchemes template default_port(url: Url): Option[uint16] = SpecialSchemes[url.scheme] template start_over() = pointer = -1 template starts_with_windows_drive_letter(s: string): bool = s.len >= 2 and s[0] in Letters and (s[1] == ':' or s[1] == '|') template is_normalized_windows_drive_letter(s: string): bool = s.len == 2 and s[0] in Letters and (s[1] == ':') template is_windows_drive_letter(s: string): bool = s.len == 2 and s[0] in Letters and (s[1] == ':' or s[1] == '|') template is_double_dot_path_segment(s: string): bool = s == ".." or s.equalsIgnoreCase(".%2e") or s.equalsIgnoreCase("%2e.") or s.equalsIgnoreCase("%2e%2e") template is_single_dot_path_segment(s: string): bool = s == "." or s.equalsIgnoreCase("%2e") template is_empty(path: UrlPath): bool = path.ss.len == 0 while pointer <= input.len: case state of SCHEME_START_STATE: if has and c.isAlphaAscii(): buffer &= c.tolower() state = SCHEME_STATE elif not override: state = NO_SCHEME_STATE dec pointer else: #TODO validation error return none(Url) of SCHEME_STATE: if has and c in Letters + {'+', '-', '.'}: buffer &= c.tolower() elif has and c == ':': if override: if url.scheme in SpecialSchemes and buffer notin SpecialSchemes: return url.some if url.scheme notin SpecialSchemes and buffer in SpecialSchemes: return url.some if (url.includes_credentials or url.port.issome) and buffer == "file": return url.some if url.scheme == "file" and url.host.get.isempty: return url.some url.scheme = buffer if override: if url.default_port == url.port: url.port = none(uint16) return url.some buffer = "" if url.scheme == "file": #TODO If remaining does not start with "//", validation error. state = FILE_STATE elif url.is_special and not base.isnone and base.get.scheme == url.scheme: state = SPECIAL_RELATIVE_OR_AUTHORITY_STATE elif url.is_special: state = SPECIAL_AUTHORITY_SLASHES_STATE elif has(1) and c(1) == '/': state = PATH_OR_AUTHORITY_STATE else: url.path = EmptyPath state = OPAQUE_PATH_STATE elif not override: buffer = "" state = NO_SCHEME_STATE start_over else: #TODO validation error return none(Url) of NO_SCHEME_STATE: if base.isnone or base.get.path.opaque and (not has or c != '#'): #TODO validation error return none(Url) elif base.get.path.opaque and has and c == '#': url.scheme = base.get.scheme url.path = base.get.path url.query = base.get.query url.fragment = "".some state = FRAGMENT_STATE elif base.get.scheme != "file": state = RELATIVE_STATE dec pointer else: state = FILE_STATE dec pointer of SPECIAL_RELATIVE_OR_AUTHORITY_STATE: if has(1) and c == '/' and c(1) == '/': state = SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE inc pointer else: #TODO validation error state = RELATIVE_STATE dec pointer of PATH_OR_AUTHORITY_STATE: if c == '/': state = AUTHORITY_STATE else: state = PATH_STATE dec pointer of RELATIVE_STATE: assert base.get.scheme != "file" url.scheme = base.get.scheme if has and c == '/': state = RELATIVE_SLASH_STATE elif url.is_special and has and c == '\\': #TODO validation error state = RELATIVE_SLASH_STATE else: url.username = base.get.username url.password = base.get.password url.host = base.get.host url.port = base.get.port url.path = base.get.path url.query = base.get.query if has and c == '?': url.query = "".some state = QUERY_STATE elif has and c == '#': url.fragment = "".some state = FRAGMENT_STATE else: url.query = none(string) url.shorten_path() state = PATH_STATE dec pointer of RELATIVE_SLASH_STATE: if url.is_special and has and c in {'/', '\\'}: #TODO if c is \ validation error state = SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE elif has and c == '/': state = AUTHORITY_STATE else: url.username = base.get.username url.password = base.get.password url.host = base.get.host url.port = base.get.port state = PATH_STATE dec pointer of SPECIAL_AUTHORITY_SLASHES_STATE: if has(1) and c == '/' and c(1) == '/': state = SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE inc pointer else: #TODO validation error state = SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE dec pointer of SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE: if has and c notin {'/', '\\'}: state = AUTHORITY_STATE dec pointer else: #TODO validation error discard of AUTHORITY_STATE: if has and c == '@': #TODO validation error if atsignseen: buffer = "%40" & buffer atsignseen = true var i = 0 while i < buffer.len: if c == ':' and not passwordtokenseen: passwordtokenseen = true inc i continue if passwordtokenseen: url.password.percentEncode(c, UserInfoPercentEncodeSet) else: url.username.percentEncode(c, UserInfoPercentEncodeSet) buffer = "" elif not has or c in {'/', '?', '#'} or (url.is_special and c == '\\'): if atsignseen and buffer == "": #TODO validation error return none(Url) pointer -= buffer.len + 1 buffer = "" state = HOST_STATE else: buffer &= c of HOST_STATE, HOSTNAME_STATE: if override and url.scheme == "file": dec pointer state = FILE_HOST_STATE elif has and c == ':' and not insidebrackets: if buffer == "": #TODO validation error return none(Url) let host = parseHost(buffer) if host.isnone: return none(Url) url.host = host buffer = "" state = PORT_STATE elif (not has or c in {'/', '?', '#'}) or (url.is_special and c == '\\'): dec pointer if url.is_special and buffer == "": #TODO validation error return none(Url) elif override and buffer == "" and (url.includes_credentials or url.port.issome): return let host = parseHost(buffer) if host.isnone: return none(Url) url.host = host buffer = "" state = PATH_START_STATE if override: return else: if c == '[': insidebrackets = true elif c == ']': insidebrackets = false buffer &= c of PORT_STATE: if has and c in Digits: buffer &= c elif (not has or c in {'/', '?', '#'}) or (url.is_special and c == '\\') or override: if buffer != "": let i = parseInt32(buffer) if i notin 0..65535: #TODO validation error return none(Url) let port = cast[uint16](i).some url.port = if url.default_port == port: none(uint16) else: port buffer = "" if override: return state = PATH_START_STATE dec pointer else: #TODO validation error return none(Url) of FILE_STATE: url.scheme = "file" url.host = EmptyHost if has and (c == '/' or c == '\\'): #TODO if c == '\\' validation error state = FILE_SLASH_STATE elif base.issome and base.get.scheme == "file": url.host = base.get.host url.path = base.get.path url.query = base.get.query if has: if c == '?': url.query = "".some state = QUERY_STATE elif c == '#': url.fragment = "".some state = FRAGMENT_STATE else: url.query = none(string) if not input.substr(pointer).starts_with_windows_drive_letter(): url.shorten_path() else: #TODO validation error url.path.ss.setLen(0) state = PATH_STATE dec pointer else: state = PATH_STATE dec pointer of FILE_SLASH_STATE: if has and (c == '/' or c == '\\'): #TODO if c == '\\' validation error state = FILE_HOST_STATE else: if base.issome and base.get.scheme == "file": url.host = base.get.host let bpath = base.get.path.ss if not input.substr(pointer).starts_with_windows_drive_letter() and bpath.len > 0 and bpath[0].is_normalized_windows_drive_letter(): url.path.append(bpath[0]) state = PATH_STATE dec pointer of FILE_HOST_STATE: if (not has or c in {'/', '\\', '?', '#'}): dec pointer if not override and buffer.is_windows_drive_letter: #TODO validation error state = PATH_STATE elif buffer == "": url.host = Host(domain: "").some if override: return state = PATH_START_STATE else: var host = parseHost(buffer) if host.isnone: return none(Url) if host.get.domain == "localhost": host.get.domain = "" url.host = host if override: return buffer = "" state = PATH_START_STATE else: buffer &= c of PATH_START_STATE: if url.is_special: #TODO if c == '\\' validation error state = PATH_STATE if not has or c notin {'/', '\\'}: dec pointer elif not override and has and c == '?': url.query = "".some state = QUERY_STATE elif not override and has and c == '#': url.fragment = "".some state = FRAGMENT_STATE elif has: state = PATH_STATE if c != '/': dec pointer elif override and url.host.isnone: url.path.append("") of PATH_STATE: if not has or c == '/' or (url.is_special and c == '\\') or (not override and c in {'?', '#'}): #TODO if url.is_special and c == '\\' validation error let slash_cond = not has or (c != '/' and not (url.is_special and c == '\\')) if buffer.is_double_dot_path_segment: url.shorten_path() if slash_cond: url.path.append("") elif buffer.is_single_dot_path_segment and slash_cond: url.path.append("") elif not buffer.is_single_dot_path_segment: if url.scheme == "file" and url.path.is_empty and buffer.is_windows_drive_letter: buffer[1] = ':' url.path.append(buffer) buffer = "" if has: if c == '?': url.query = "".some state = QUERY_STATE elif c == '#': url.fragment = "".some state = FRAGMENT_STATE else: #TODO If c is not a URL code point and not U+0025 (%), validation error. #TODO If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. buffer.percentEncode(c, PathPercentEncodeSet) of OPAQUE_PATH_STATE: if has: if c == '?': url.query = "".some state = QUERY_STATE elif c == '#': url.fragment = "".some state = FRAGMENT_STATE else: #TODO If c is not the EOF code point, not a URL code point, and not U+0025 (%), validation error. #TODO If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. url.path.append(percentEncode(c, ControlPercentEncodeSet)) of QUERY_STATE: #TODO encoding if not has or (not override and c == '#'): let querypercentencodeset = if url.is_special: SpecialQueryPercentEncodeSet else: QueryPercentEncodeSet url.query.get.percentEncode(buffer, querypercentencodeset) buffer = "" if has and c == '#': url.fragment = "".some state = FRAGMENT_STATE elif has: #TODO If c is not a URL code point and not U+0025 (%), validation error. #TODO If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. buffer &= c of FRAGMENT_STATE: if has: #TODO If c is not a URL code point and not U+0025 (%), validation error. #TODO If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. url.fragment.get.percentEncode(c, FragmentPercentEncodeSet) inc pointer return url.some func anchor*(url: Url): string = if url.fragment.issome: return url.fragment.get return "" proc parseUrl*(input: string, base = none(Url), url: var Url, override: bool = false): Option[Url] = var url = basicParseUrl(input, base, url, override) if url.isnone: return url if url.get.scheme != "blob": return url url.get.blob = BlobUrlEntry().some return url proc parseUrl*(input: string, base = none(Url), override: bool = false): Option[Url] = var url = Url().some url = basicParseUrl(input, base, url.get, override) if url.isnone: return url if url.get.scheme != "blob": return url url.get.blob = BlobUrlEntry().some return url func serializeip(ipv4: uint32): string = var n = ipv4 for i in 1..4: result = $(n mod 256) & result if i != 4: result = '.' & result n = n.floorDiv 256u32 assert n == 0 func findZeroSeq(ipv6: array[8, uint16]): int = var maxi = -1 var maxn = 0 var newi = -1 var newn = 1 for i in low(ipv6)..high(ipv6): if ipv6[i] == 0: inc newn if newi == -1: newi = i else: if newn > maxn: maxn = newn maxi = newi newn = 0 newi = -1 if newn > maxn: return newi return maxi func serializeip(ipv6: array[8, uint16]): string = let compress = findZeroSeq(ipv6) var ignore0 = false for i in low(ipv6)..high(ipv6): if ignore0: if ipv6[i] == 0: continue else: ignore0 = false if i == compress: if i == 0: result &= "::" else: result &= ':' ignore0 = true continue result &= toHex(ipv6[i]) if i != high(ipv6): result &= ':' func serialize(host: Host): string = if host.ipv4.issome: return serializeip(host.ipv4.get) if host.ipv6.issome: return "[" & serializeip(host.ipv6.get) & "]" if host.opaquehost != "": return host.opaquehost return host.domain func serialize*(path: UrlPath): string {.inline.} = if path.opaque: return path.s for s in path.ss: result &= '/' result &= s func serialize_unicode*(path: UrlPath): string {.inline.} = if path.opaque: return percentDecode(path.s) for s in path.ss: result &= '/' result &= percentDecode(s) func serialize_unicode_dos*(path: UrlPath): string {.inline.} = if path.opaque: return percentDecode(path.s) var i = 0 if i < path.ss.len: if path.ss[i].is_windows_drive_letter: result &= path.ss[i] inc i while i < path.ss.len: let s = path.ss[i] result &= '\\' result &= percentDecode(s) inc i func serialize*(url: Url, excludefragment = false): string = result = url.scheme & ':' if url.host.issome: result &= "//" if url.includes_credentials: result &= url.username if url.password != "": result &= ':' & url.password result &= '@' result &= url.host.get.serialize if url.port.issome: result &= ':' & $url.port.get elif not url.path.opaque and url.path.ss.len > 1 and url.path.ss[0] == "": result &= "/." result &= url.path.serialize() if url.query.issome: result &= '?' & url.query.get if not excludefragment and url.fragment.issome: result &= '#' & url.fragment.get func serialize*(url: Option[Url], excludefragment = false): string = if url.isnone: return "" return url.get.serialize() func equals*(a, b: Url, excludefragment = false): bool = return a.serialize(excludefragment) == b.serialize(excludefragment) func `$`*(url: Url): string {.inline.} = url.serialize() func `$`*(path: UrlPath): string {.inline.} = path.serialize()