summary refs log tree commit diff stats
path: root/lib/pure/uri.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/uri.nim')
-rw-r--r--lib/pure/uri.nim572
1 files changed, 572 insertions, 0 deletions
diff --git a/lib/pure/uri.nim b/lib/pure/uri.nim
new file mode 100644
index 000000000..725d5bbd9
--- /dev/null
+++ b/lib/pure/uri.nim
@@ -0,0 +1,572 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2015 Dominik Picheta
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements URI parsing as specified by RFC 3986.
+##
+## A Uniform Resource Identifier (URI) provides a simple and extensible
+## means for identifying a resource. A URI can be further classified
+## as a locator, a name, or both. The term "Uniform Resource Locator"
+## (URL) refers to the subset of URIs.
+##
+## .. warning:: URI parsers in this module do not perform security validation.
+##
+## # Basic usage
+
+
+## ## Combine URIs
+runnableExamples:
+  let host = parseUri("https://nim-lang.org")
+  assert $host == "https://nim-lang.org"
+  assert $(host / "/blog.html") == "https://nim-lang.org/blog.html"
+  assert $(host / "blog2.html") == "https://nim-lang.org/blog2.html"
+
+## ## Access URI item
+runnableExamples:
+  let res = parseUri("sftp://127.0.0.1:4343")
+  assert isAbsolute(res)
+  assert res.port == "4343"
+
+## ## Data URI Base64
+runnableExamples:
+  assert getDataUri("Hello World", "text/plain") == "data:text/plain;charset=utf-8;base64,SGVsbG8gV29ybGQ="
+  assert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"
+
+
+import std/[strutils, parseutils, base64]
+import std/private/[since, decode_helpers]
+
+when defined(nimPreviewSlimSystem):
+  import std/assertions
+
+
+type
+  Url* = distinct string
+
+  Uri* = object
+    scheme*, username*, password*: string
+    hostname*, port*, path*, query*, anchor*: string
+    opaque*: bool
+    isIpv6*: bool
+
+  UriParseError* = object of ValueError
+
+
+proc uriParseError*(msg: string) {.noreturn.} =
+  ## Raises a `UriParseError` exception with message `msg`.
+  raise newException(UriParseError, msg)
+
+func encodeUrl*(s: string, usePlus = true): string =
+  ## Encodes a URL according to RFC3986.
+  ##
+  ## This means that characters in the set
+  ## `{'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'}` are
+  ## carried over to the result.
+  ## All other characters are encoded as `%xx` where `xx`
+  ## denotes its hexadecimal value.
+  ##
+  ## As a special rule, when the value of `usePlus` is true,
+  ## spaces are encoded as `+` instead of `%20`.
+  ##
+  ## **See also:**
+  ## * `decodeUrl func<#decodeUrl,string>`_
+  runnableExamples:
+    assert encodeUrl("https://nim-lang.org") == "https%3A%2F%2Fnim-lang.org"
+    assert encodeUrl("https://nim-lang.org/this is a test") == "https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test"
+    assert encodeUrl("https://nim-lang.org/this is a test", false) == "https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test"
+  result = newStringOfCap(s.len + s.len shr 2) # assume 12% non-alnum-chars
+  let fromSpace = if usePlus: "+" else: "%20"
+  for c in s:
+    case c
+    # https://tools.ietf.org/html/rfc3986#section-2.3
+    of 'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~': add(result, c)
+    of ' ': add(result, fromSpace)
+    else:
+      add(result, '%')
+      add(result, toHex(ord(c), 2))
+
+func decodeUrl*(s: string, decodePlus = true): string =
+  ## Decodes a URL according to RFC3986.
+  ##
+  ## This means that any `%xx` (where `xx` denotes a hexadecimal
+  ## value) are converted to the character with ordinal number `xx`,
+  ## and every other character is carried over.
+  ## If `xx` is not a valid hexadecimal value, it is left intact.
+  ##
+  ## As a special rule, when the value of `decodePlus` is true, `+`
+  ## characters are converted to a space.
+  ##
+  ## **See also:**
+  ## * `encodeUrl func<#encodeUrl,string>`_
+  runnableExamples:
+    assert decodeUrl("https%3A%2F%2Fnim-lang.org") == "https://nim-lang.org"
+    assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test") == "https://nim-lang.org/this is a test"
+    assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test",
+        false) == "https://nim-lang.org/this is a test"
+    assert decodeUrl("abc%xyz") == "abc%xyz"
+
+  result = newString(s.len)
+  var i = 0
+  var j = 0
+  while i < s.len:
+    case s[i]
+    of '%':
+      result[j] = decodePercent(s, i)
+    of '+':
+      if decodePlus:
+        result[j] = ' '
+      else:
+        result[j] = s[i]
+    else: result[j] = s[i]
+    inc(i)
+    inc(j)
+  setLen(result, j)
+
+func encodeQuery*(query: openArray[(string, string)], usePlus = true,
+    omitEq = true, sep = '&'): string =
+  ## Encodes a set of (key, value) parameters into a URL query string.
+  ##
+  ## Every (key, value) pair is URL-encoded and written as `key=value`. If the
+  ## value is an empty string then the `=` is omitted, unless `omitEq` is
+  ## false.
+  ## The pairs are joined together by the `sep` character.
+  ##
+  ## The `usePlus` parameter is passed down to the `encodeUrl` function that
+  ## is used for the URL encoding of the string values.
+  ##
+  ## **See also:**
+  ## * `encodeUrl func<#encodeUrl,string>`_
+  runnableExamples:
+    assert encodeQuery({: }) == ""
+    assert encodeQuery({"a": "1", "b": "2"}) == "a=1&b=2"
+    assert encodeQuery({"a": "1", "b": ""}) == "a=1&b"
+    assert encodeQuery({"a": "1", "b": ""}, omitEq = false, sep = ';') == "a=1;b="
+  for elem in query:
+    # Encode the `key = value` pairs and separate them with 'sep'
+    if result.len > 0: result.add(sep)
+    let (key, val) = elem
+    result.add(encodeUrl(key, usePlus))
+    # Omit the '=' if the value string is empty
+    if not omitEq or val.len > 0:
+      result.add('=')
+      result.add(encodeUrl(val, usePlus))
+
+iterator decodeQuery*(data: string, sep = '&'): tuple[key, value: string] =
+  ## Reads and decodes the query string `data` and yields the `(key, value)` pairs
+  ## the data consists of. If compiled with `-d:nimLegacyParseQueryStrict`,
+  ## a `UriParseError` is raised when there is an unencoded `=` character in a decoded
+  ## value, which was the behavior in Nim < 1.5.1.
+  runnableExamples:
+    import std/sequtils
+    assert toSeq(decodeQuery("foo=1&bar=2=3")) == @[("foo", "1"), ("bar", "2=3")]
+    assert toSeq(decodeQuery("foo=1;bar=2=3", ';')) == @[("foo", "1"), ("bar", "2=3")]
+    assert toSeq(decodeQuery("&a&=b&=&&")) == @[("", ""), ("a", ""), ("", "b"), ("", ""), ("", "")]
+
+  proc parseData(data: string, i: int, field: var string, sep: char): int =
+    result = i
+    while result < data.len:
+      let c = data[result]
+      case c
+      of '%': add(field, decodePercent(data, result))
+      of '+': add(field, ' ')
+      of '&': break
+      else:
+        if c == sep: break
+        else: add(field, data[result])
+      inc(result)
+
+  var i = 0
+  var name = ""
+  var value = ""
+  # decode everything in one pass:
+  while i < data.len:
+    setLen(name, 0) # reuse memory
+    i = parseData(data, i, name, '=')
+    setLen(value, 0) # reuse memory
+    if i < data.len and data[i] == '=':
+      inc(i) # skip '='
+      when defined(nimLegacyParseQueryStrict):
+        i = parseData(data, i, value, '=')
+      else:
+        i = parseData(data, i, value, sep)
+    yield (name, value)
+    if i < data.len:
+      when defined(nimLegacyParseQueryStrict):
+        if data[i] != '&':
+          uriParseError("'&' expected at index '$#' for '$#'" % [$i, data])
+      inc(i)
+
+func parseAuthority(authority: string, result: var Uri) =
+  var i = 0
+  var inPort = false
+  var inIPv6 = false
+  while i < authority.len:
+    case authority[i]
+    of '@':
+      swap result.password, result.port
+      result.port.setLen(0)
+      swap result.username, result.hostname
+      result.hostname.setLen(0)
+      inPort = false
+    of ':':
+      if inIPv6:
+        result.hostname.add(authority[i])
+      else:
+        inPort = true
+    of '[':
+      inIPv6 = true
+      result.isIpv6 = true
+    of ']':
+      inIPv6 = false
+    else:
+      if inPort:
+        result.port.add(authority[i])
+      else:
+        result.hostname.add(authority[i])
+    i.inc
+
+func parsePath(uri: string, i: var int, result: var Uri) =
+  i.inc parseUntil(uri, result.path, {'?', '#'}, i)
+
+  # The 'mailto' scheme's PATH actually contains the hostname/username
+  if cmpIgnoreCase(result.scheme, "mailto") == 0:
+    parseAuthority(result.path, result)
+    result.path.setLen(0)
+
+  if i < uri.len and uri[i] == '?':
+    i.inc # Skip '?'
+    i.inc parseUntil(uri, result.query, {'#'}, i)
+
+  if i < uri.len and uri[i] == '#':
+    i.inc # Skip '#'
+    i.inc parseUntil(uri, result.anchor, {}, i)
+
+func initUri*(isIpv6 = false): Uri =
+  ## Initializes a URI with `scheme`, `username`, `password`,
+  ## `hostname`, `port`, `path`, `query`, `anchor` and `isIpv6`.
+  ##
+  ## **See also:**
+  ## * `Uri type <#Uri>`_ for available fields in the URI type
+  runnableExamples:
+    var uri2 = initUri(isIpv6 = true)
+    uri2.scheme = "tcp"
+    uri2.hostname = "2001:0db8:85a3:0000:0000:8a2e:0370:7334"
+    uri2.port = "8080"
+    assert $uri2 == "tcp://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080"
+  result = Uri(scheme: "", username: "", password: "", hostname: "", port: "",
+                path: "", query: "", anchor: "", isIpv6: isIpv6)
+
+func resetUri(uri: var Uri) =
+  for f in uri.fields:
+    when f is string:
+      f.setLen(0)
+    else:
+      f = false
+
+func parseUri*(uri: string, result: var Uri) =
+  ## Parses a URI. The `result` variable will be cleared before.
+  ##
+  ## **See also:**
+  ## * `Uri type <#Uri>`_ for available fields in the URI type
+  ## * `initUri func <#initUri>`_ for initializing a URI
+  runnableExamples:
+    var res = initUri()
+    parseUri("https://nim-lang.org/docs/manual.html", res)
+    assert res.scheme == "https"
+    assert res.hostname == "nim-lang.org"
+    assert res.path == "/docs/manual.html"
+  resetUri(result)
+
+  var i = 0
+
+  # Check if this is a reference URI (relative URI)
+  let doubleSlash = uri.len > 1 and uri[0] == '/' and uri[1] == '/'
+  if i < uri.len and uri[i] == '/':
+    # Make sure `uri` doesn't begin with '//'.
+    if not doubleSlash:
+      parsePath(uri, i, result)
+      return
+
+  # Scheme
+  i.inc parseWhile(uri, result.scheme, Letters + Digits + {'+', '-', '.'}, i)
+  if (i >= uri.len or uri[i] != ':') and not doubleSlash:
+    # Assume this is a reference URI (relative URI)
+    i = 0
+    result.scheme.setLen(0)
+    parsePath(uri, i, result)
+    return
+  if not doubleSlash:
+    i.inc # Skip ':'
+
+  # Authority
+  if i+1 < uri.len and uri[i] == '/' and uri[i+1] == '/':
+    i.inc(2) # Skip //
+    var authority = ""
+    i.inc parseUntil(uri, authority, {'/', '?', '#'}, i)
+    if authority.len > 0:
+      parseAuthority(authority, result)
+  else:
+    result.opaque = true
+
+  # Path
+  parsePath(uri, i, result)
+
+func parseUri*(uri: string): Uri =
+  ## Parses a URI and returns it.
+  ##
+  ## **See also:**
+  ## * `Uri type <#Uri>`_ for available fields in the URI type
+  runnableExamples:
+    let res = parseUri("ftp://Username:Password@Hostname")
+    assert res.username == "Username"
+    assert res.password == "Password"
+    assert res.scheme == "ftp"
+  result = initUri()
+  parseUri(uri, result)
+
+func removeDotSegments(path: string): string =
+  ## Collapses `..` and `.` in `path` in a similar way as done in `os.normalizedPath`
+  ## Caution: this is buggy.
+  runnableExamples:
+    assert removeDotSegments("a1/a2/../a3/a4/a5/./a6/a7/.//./") == "a1/a3/a4/a5/a6/a7/"
+    assert removeDotSegments("http://www.ai.") == "http://www.ai."
+  # xxx adapt or reuse `pathnorm.normalizePath(path, '/')` to make this more reliable, but
+  # taking into account url specificities such as not collapsing leading `//` in scheme
+  # `https://`. see `turi` for failing tests.
+  if path.len == 0: return ""
+  var collection: seq[string] = @[]
+  let endsWithSlash = path.endsWith '/'
+  var i = 0
+  var currentSegment = ""
+  while i < path.len:
+    case path[i]
+    of '/':
+      collection.add(currentSegment)
+      currentSegment = ""
+    of '.':
+      if i+2 < path.len and path[i+1] == '.' and path[i+2] == '/':
+        if collection.len > 0:
+          discard collection.pop()
+          i.inc 3
+          continue
+      elif i + 1 < path.len and path[i+1] == '/':
+        i.inc 2
+        continue
+      currentSegment.add path[i]
+    else:
+      currentSegment.add path[i]
+    i.inc
+  if currentSegment != "":
+    collection.add currentSegment
+
+  result = collection.join("/")
+  if endsWithSlash: result.add '/'
+
+func merge(base, reference: Uri): string =
+  # http://tools.ietf.org/html/rfc3986#section-5.2.3
+  if base.hostname != "" and base.path == "":
+    '/' & reference.path
+  else:
+    let lastSegment = rfind(base.path, "/")
+    if lastSegment == -1:
+      reference.path
+    else:
+      base.path[0 .. lastSegment] & reference.path
+
+func combine*(base: Uri, reference: Uri): Uri =
+  ## Combines a base URI with a reference URI.
+  ##
+  ## This uses the algorithm specified in
+  ## `section 5.2.2 of RFC 3986 <http://tools.ietf.org/html/rfc3986#section-5.2.2>`_.
+  ##
+  ## This means that the slashes inside the base URIs path as well as reference
+  ## URIs path affect the resulting URI.
+  ##
+  ## **See also:**
+  ## * `/ func <#/,Uri,string>`_ for building URIs
+  runnableExamples:
+    let foo = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("/baz"))
+    assert foo.path == "/baz"
+    let bar = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("baz"))
+    assert bar.path == "/foo/baz"
+    let qux = combine(parseUri("https://nim-lang.org/foo/bar/"), parseUri("baz"))
+    assert qux.path == "/foo/bar/baz"
+
+  template setAuthority(dest, src): untyped =
+    dest.hostname = src.hostname
+    dest.username = src.username
+    dest.port = src.port
+    dest.password = src.password
+
+  result = initUri()
+  if reference.scheme != base.scheme and reference.scheme != "":
+    result = reference
+    result.path = removeDotSegments(result.path)
+  else:
+    if reference.hostname != "":
+      setAuthority(result, reference)
+      result.path = removeDotSegments(reference.path)
+      result.query = reference.query
+    else:
+      if reference.path == "":
+        result.path = base.path
+        if reference.query != "":
+          result.query = reference.query
+        else:
+          result.query = base.query
+      else:
+        if reference.path.startsWith("/"):
+          result.path = removeDotSegments(reference.path)
+        else:
+          result.path = removeDotSegments(merge(base, reference))
+        result.query = reference.query
+      setAuthority(result, base)
+    result.scheme = base.scheme
+  result.anchor = reference.anchor
+
+func combine*(uris: varargs[Uri]): Uri =
+  ## Combines multiple URIs together.
+  ##
+  ## **See also:**
+  ## * `/ func <#/,Uri,string>`_ for building URIs
+  runnableExamples:
+    let foo = combine(parseUri("https://nim-lang.org/"), parseUri("docs/"),
+        parseUri("manual.html"))
+    assert foo.hostname == "nim-lang.org"
+    assert foo.path == "/docs/manual.html"
+  result = uris[0]
+  for i in 1 ..< uris.len:
+    result = combine(result, uris[i])
+
+func isAbsolute*(uri: Uri): bool =
+  ## Returns true if URI is absolute, false otherwise.
+  runnableExamples:
+    assert parseUri("https://nim-lang.org").isAbsolute
+    assert not parseUri("nim-lang").isAbsolute
+  return uri.scheme != "" and (uri.hostname != "" or uri.path != "")
+
+func `/`*(x: Uri, path: string): Uri =
+  ## Concatenates the path specified to the specified URIs path.
+  ##
+  ## Contrary to the `combine func <#combine,Uri,Uri>`_ you do not have to worry about
+  ## the slashes at the beginning and end of the path and URIs path
+  ## respectively.
+  ##
+  ## **See also:**
+  ## * `combine func <#combine,Uri,Uri>`_
+  runnableExamples:
+    let foo = parseUri("https://nim-lang.org/foo/bar") / "/baz"
+    assert foo.path == "/foo/bar/baz"
+    let bar = parseUri("https://nim-lang.org/foo/bar") / "baz"
+    assert bar.path == "/foo/bar/baz"
+    let qux = parseUri("https://nim-lang.org/foo/bar/") / "baz"
+    assert qux.path == "/foo/bar/baz"
+  result = x
+
+  if result.path.len == 0:
+    if path.len == 0 or path[0] != '/':
+      result.path = "/"
+    result.path.add(path)
+    return
+
+  if result.path.len > 0 and result.path[result.path.len-1] == '/':
+    if path.len > 0 and path[0] == '/':
+      result.path.add(path[1 .. path.len-1])
+    else:
+      result.path.add(path)
+  else:
+    if path.len == 0 or path[0] != '/':
+      result.path.add '/'
+    result.path.add(path)
+
+func `?`*(u: Uri, query: openArray[(string, string)]): Uri =
+  ## Concatenates the query parameters to the specified URI object.
+  runnableExamples:
+    let foo = parseUri("https://example.com") / "foo" ? {"bar": "qux"}
+    assert $foo == "https://example.com/foo?bar=qux"
+  result = u
+  result.query = encodeQuery(query)
+
+func `$`*(u: Uri): string =
+  ## Returns the string representation of the specified URI object.
+  runnableExamples:
+    assert $parseUri("https://nim-lang.org") == "https://nim-lang.org"
+  # Get the len of all the parts.
+  let schemeLen = u.scheme.len
+  let usernameLen = u.username.len
+  let passwordLen = u.password.len
+  let hostnameLen = u.hostname.len
+  let portLen = u.port.len
+  let pathLen = u.path.len
+  let queryLen = u.query.len
+  let anchorLen = u.anchor.len
+  # Prepare a string that fits all the parts and all punctuation chars.
+  # 12 is the max len required by all possible punctuation chars.
+  result = newStringOfCap(
+    schemeLen + usernameLen + passwordLen + hostnameLen + portLen + pathLen + queryLen + anchorLen + 12
+  )
+  # Insert to result.
+  if schemeLen > 0:
+    result.add u.scheme
+    result.add ':'
+    if not u.opaque:
+      result.add '/'
+      result.add '/'
+  if usernameLen > 0:
+    result.add u.username
+    if passwordLen > 0:
+      result.add ':'
+      result.add u.password
+    result.add '@'
+  if u.hostname.endsWith('/'):
+    if u.isIpv6:
+      result.add '['
+      result.add u.hostname[0 .. ^2]
+      result.add ']'
+    else:
+      result.add u.hostname[0 .. ^2]
+  else:
+    if u.isIpv6:
+      result.add '['
+      result.add u.hostname
+      result.add ']'
+    else:
+      result.add u.hostname
+  if portLen > 0:
+    result.add ':'
+    result.add u.port
+  if pathLen > 0:
+    if hostnameLen > 0 and u.path[0] != '/':
+      result.add '/'
+    result.add u.path
+  if queryLen > 0:
+    result.add '?'
+    result.add u.query
+  if anchorLen > 0:
+    result.add '#'
+    result.add u.anchor
+
+
+proc getDataUri*(data, mime: string, encoding = "utf-8"): string {.since: (1, 3).} =
+  ## Convenience proc for `base64.encode` returns a standard Base64 Data URI (RFC-2397)
+  ##
+  ## **See also:**
+  ## * `mimetypes <mimetypes.html>`_ for `mime` argument
+  ## * https://tools.ietf.org/html/rfc2397
+  ## * https://en.wikipedia.org/wiki/Data_URI_scheme
+  runnableExamples: static: assert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"
+  assert encoding.len > 0 and mime.len > 0 # Must *not* be URL-Safe, see RFC-2397
+  let base64encoded: string = base64.encode(data)
+  # ("data:".len + ";charset=".len + ";base64,".len) == 22
+  result = newStringOfCap(22 + mime.len + encoding.len + base64encoded.len)
+  result.add "data:"
+  result.add mime
+  result.add ";charset="
+  result.add encoding
+  result.add ";base64,"
+  result.add base64encoded