diff options
author | rumpf_a@web.de <> | 2010-01-30 20:05:29 +0100 |
---|---|---|
committer | rumpf_a@web.de <> | 2010-01-30 20:05:29 +0100 |
commit | d318f2eb358a6bf6f2e93c1618fd824d0d8368fe (patch) | |
tree | 826122e20a5f34ebef27af744344dd912247c7f7 /lib/devel | |
parent | e20293818cb49af9dac1280af793f623a2f69707 (diff) | |
download | Nim-d318f2eb358a6bf6f2e93c1618fd824d0d8368fe.tar.gz |
progress for httpclient
Diffstat (limited to 'lib/devel')
-rwxr-xr-x | lib/devel/httpclient.nim | 246 | ||||
-rwxr-xr-x | lib/devel/parseurl.nim | 149 | ||||
-rw-r--r-- | lib/devel/parseutils.nim | 63 |
3 files changed, 289 insertions, 169 deletions
diff --git a/lib/devel/httpclient.nim b/lib/devel/httpclient.nim index abea34ea6..391a037f7 100755 --- a/lib/devel/httpclient.nim +++ b/lib/devel/httpclient.nim @@ -1,26 +1,113 @@ -import sockets, strutils, parseurl, pegs +# +# +# Nimrod's Runtime Library +# (c) Copyright 2010 Dominik Picheta, Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module implements a simple HTTP client that can be used to retrieve +## webpages/other data. + +# neuer Code: +import sockets, strutils, parseurl, pegs, os, parseutils type - response = tuple[version: string, status: string, headers: seq[header], body: string] - header = tuple[htype: string, hvalue: string] + TResponse* = tuple[ + version: string, status: string, headers: seq[THeader], + body: string] + THeader* = tuple[htype: string, hvalue: string] EInvalidHttp* = object of EBase ## exception that is raised when server does ## not conform to the implemented HTTP ## protocol + EHttpRequestErr* = object of EBase ## Thrown in the ``getContent`` proc, + ## when the server returns an error + +template newException(exceptn, message: expr): expr = + block: # open a new scope + var + e: ref exceptn + new(e) + e.msg = message + e + proc httpError(msg: string) = var e: ref EInvalidHttp new(e) e.msg = msg raise e + +proc fileError(msg: string) = + var e: ref EIO + new(e) + e.msg = msg + raise e + +proc getHeaderValue*(headers: seq[THeader], name: string): string = + ## Retrieves a header by ``name``, from ``headers``. + ## Returns "" if a header is not found + for i in low(headers)..high(headers): + if cmpIgnoreCase(headers[i].htype, name) == 0: + return headers[i].hvalue + return "" + +proc parseBody(data: var string, start: int, s: TSocket, + headers: seq[THeader]): string = + if getHeaderValue(headers, "Transfer-Encoding") == "chunked": + # get chunks: + var i = start + result = "" + while true: + var chunkSize = 0 + var j = parseHex(data, chunkSize, i) + if j <= 0: break + inc(i, j) + while data[i] notin {'\C', '\L', '\0'}: inc(i) + if data[i] == '\C': inc(i) + if data[i] == '\L': inc(i) + if chunkSize <= 0: break + result.add(copy(data, i, i+chunkSize-1)) + if i + chunkSize > data.len: + echo "i: ", i, " size: ", chunkSize, " len: ", data.len + + assert(i + chunkSize <= data.len) + i = i + chunkSize + # skip trailing CR-LF: + #if data[i] == '\C': inc(i) + #if data[i] == '\L': inc(i) + + echo "came here" + data.add(s.recv()) + else: + result = copy(data, start) + # -REGION- Content-Length + # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.3 + var contentLengthHeader = getHeaderValue(headers, "Content-Length") + if contentLengthHeader != "": + var length = contentLengthHeader.parseint() + while result.len() < length: result.add(s.recv()) + else: + # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.4 TODO + + # -REGION- Connection: Close + # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.5 + if getHeaderValue(headers, "Connection") == "close": + while True: + var moreData = recv(s) + if moreData.len == 0: break + result.add(moreData) -proc parseResponse(data: string): response = +proc parseResponse(s: TSocket): TResponse = + var data = s.recv() var i = 0 - #Parse the version - #Parses the first line of the headers - #``HTTP/1.1`` 200 OK - + # Parse the version + # Parses the first line of the headers + # ``HTTP/1.1`` 200 OK + var matches: array[0..1, string] var L = data.matchLen(peg"\i 'HTTP/' {'1.1'/'1.0'} \s+ {(!\n .)*}\n", matches, i) @@ -30,9 +117,9 @@ proc parseResponse(data: string): response = result.status = matches[1] inc(i, L) - #Parse the headers - #Everything after the first line leading up to the body - #htype: hvalue + # Parse the headers + # Everything after the first line leading up to the body + # htype: hvalue result.headers = @[] while true: @@ -42,7 +129,7 @@ proc parseResponse(data: string): response = key.add(data[i]) inc(i) inc(i) # skip ':' - if data[i] == ' ': inc(i) + if data[i] == ' ': inc(i) # skip if the character is a space var val = "" while data[i] notin {'\C', '\L', '\0'}: val.add(data[i]) @@ -59,58 +146,9 @@ proc parseResponse(data: string): response = inc(i) break - #Parse the body - #Everything after the headers(The first double CRLF) - result.body = data.copy(i) - + result.body = parseBody(data, i, s, result.headers) -proc readChunked(data: var string, s: TSocket): response = - #Read data from socket until the terminating chunk size is found(0\c\L\c\L) - while true: - data.add(s.recv()) - #Contains because - #trailers might be present - #after the terminating chunk size - if data.contains("0\c\L\c\L"): - break - - result = parseResponse(data) #Re-parse the body - - var count, length, chunkLength: int = 0 - var newBody: string = "" - var bodySplit: seq[string] = result.body.splitLines() - #Remove the chunks - for i in items(bodySplit): - if count == 1: #Get the first chunk size - chunkLength = ParseHexInt(i) - i.len() - 1 - else: - if length >= chunkLength: - #The chunk size determines how much text is left - #Until the next chunk size - chunkLength = ParseHexInt(i) - length = 0 - else: - #Break if the terminating chunk size is found - #This should ignore the `trailers` - if bodySplit[count] == "0": #This might cause problems... - break - - #Add the text to the newBody - newBody.add(i & "\c\L") - length = length + i.len() - inc(count) - #Make the parsed body the new body - result.body = newBody - -proc getHeaderValue*(headers: seq[header], name: string): string = - ## Retrieves a header by ``name``, from ``headers``. - ## Returns "" if a header is not found - for i in low(headers)..high(headers): - if cmpIgnoreCase(headers[i].htype, name) == 0: - return headers[i].hvalue - return "" - -proc request*(url: string): response = +proc request*(url: string): TResponse = var r = parse(url) var headers: string @@ -119,58 +157,46 @@ proc request*(url: string): response = else: headers = "GET / HTTP/1.1\c\L" - headers = headers & "Host: " & r.subdomain & r.domain & "\c\L\c\L" - + add(headers, "Host: " & r.hostname & "\c\L\c\L") + var s = socket() - s.connect(r.subdomain & r.domain, TPort(80)) + s.connect(r.hostname, TPort(80)) s.send(headers) + result = parseResponse(s) + s.close() - var data = s.recv() - - result = parseResponse(data) - - #-REGION- Transfer-Encoding - #-Takes precedence over Content-Length - #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.2 - var transferEncodingHeader = getHeaderValue(result.headers, "Transfer-Encoding") - if transferEncodingHeader == "chunked": - result = readChunked(data, s) +proc redirection(status: string): bool = + const redirectionNRs = ["301", "302", "303", "307"] + for i in items(redirectionNRs): + if status.startsWith(i): + return True - #-REGION- Content-Length - #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.3 - var contentLengthHeader = getHeaderValue(result.headers, "Content-Length") - if contentLengthHeader != "": - var length = contentLengthHeader.parseint() - - while data.len() < length: - data.add(s.recv()) - - result = parseResponse(data) - - #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.4 TODO - - #-REGION- Connection: Close - #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.5 - var connectionHeader = getHeaderValue(result.headers, "Connection") - if connectionHeader == "close": - while True: - var nD = s.recv() - if nD == "": break - data.add(nD) - result = parseResponse(data) - - s.close() - -proc get*(url: string): response = +proc get*(url: string, maxRedirects = 5): TResponse = + ## low-level proc similar to ``request`` which handles redirection result = request(url) + for i in 1..maxRedirects: + if result.status.redirection(): + var locationHeader = getHeaderValue(result.headers, "Location") + if locationHeader == "": httpError("location header expected") + result = request(locationHeader) + +proc getContent*(url: string): string = + ## GET's the body and returns it as a string + ## Raises exceptions for the status codes ``4xx`` and ``5xx`` + var r = get(url) + if r.status[0] in {'4','5'}: + raise newException(EHTTPRequestErr, r.status) + else: + return r.body +proc downloadFile*(url: string, outputFilename: string) = + var f: TFile + if open(f, outputFilename, fmWrite): + f.write(getContent(url)) + f.close() + else: + fileError("Unable to open file") -var r = get("http://www.google.co.uk/index.html") -#var r = get("http://www.crunchyroll.com") -echo("===================================") -echo(r.version & " " & r.status) -for htype, hvalue in items(r.headers): - echo(htype, ": ", hvalue) -echo("---------------------------------") -echo(r.body) \ No newline at end of file +when isMainModule: + downloadFile("http://www.google.com", "GoogleTest.txt") diff --git a/lib/devel/parseurl.nim b/lib/devel/parseurl.nim index 769d07561..756d5a891 100755 --- a/lib/devel/parseurl.nim +++ b/lib/devel/parseurl.nim @@ -1,64 +1,95 @@ -import regexprs, strutils +# +# +# Nimrod's Runtime Library +# (c) Copyright 2010 Dominik Picheta +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# -type - TURL* = tuple[protocol, username, password, - subdomain, domain, port, path, query, anchor: string] +## Parses & constructs URLs. + +import strutils +type + TURL* = tuple[ ## represents a *Uniform Resource Locator* (URL) + ## any optional component is "" if it does not exist + scheme, username, password, + hostname, port, path, query, anchor: string] + proc parse*(url: string): TURL = - const pattern = r"([a-zA-Z]+://)?(.+@)?(.+\.)?(\w+)(\.\w+)(:[0-9]+)?(/.+)?" - var m: array[0..7, string] #Array with the matches - discard regexprs.match(url, pattern, m) - - var msplit = m[2].split(':') + var i: int = 0 + + var scheme, username, password: string = "" + var hostname, port, path, query, anchor: string = "" + + var temp: string = "" + + if url[i] != '/': #url isn't a relative path + while True: + #Scheme + if url[i] == ':': + if url[i+1] == '/' and url[i+2] == '/': + scheme = temp + temp = "" + inc(i, 3) #Skip the // + #Authority(username, password) + if url[i] == '@': + username = temp.split(':')[0] + if temp.split(':').len() > 1: + password = temp.split(':')[1] + temp = "" + inc(i) #Skip the @ + #hostname(subdomain, domain, port) + if url[i] == '/' or url[i] == '\0': + #TODO + hostname = temp + if hostname.split(':').len() > 1: + port = hostname.split(':')[1] + hostname = hostname.split(':')[0] + + temp = "" + break + + temp.add(url[i]) + inc(i) - var username: string = "" - var password: string = "" - if m[2] != "": - username = msplit[0] - if msplit.len() == 2: - password = msplit[1].replace("@", "") + #Path + while True: + if url[i] == '?': + path = temp + temp = "" + if url[i] == '#': + if temp[0] == '?': + query = temp + else: + path = temp + temp = "" + + if url[i] == '\0': + if temp[0] == '?': + query = temp + elif temp[0] == '#': + anchor = temp + else: + path = temp + break + + temp.add(url[i]) + inc(i) + + return (scheme, username, password, hostname, port, path, query, anchor) - var path: string = "" - var query: string = "" - var anchor: string = "" - - if m[7] != nil: - msplit = m[7].split('?') - path = msplit[0] - query = "" - anchor = "" - if msplit.len() == 2: - query = "?" & msplit[1] - - msplit = path.split('#') - if msplit.len() == 2: - anchor = "#" & msplit[1] - path = msplit[0] - msplit = query.split('#') - if msplit.len() == 2: - anchor = "#" & msplit[1] - query = msplit[0] - - result = (protocol: m[1], username: username, password: password, - subdomain: m[3], domain: m[4] & m[5], port: m[6], path: path, query: query, anchor: anchor) - -when isMainModule: - proc test(r: TURL) = - echo("protocol=" & r.protocol) - echo("username=" & r.username) - echo("password=" & r.password) - echo("subdomain=" & r.subdomain) - echo("domain=" & r.domain) - echo("port=" & r.port) - echo("path=" & r.path) - echo("query=" & r.query) - echo("anchor=" & r.anchor) - echo("---------------") - - var r: TURL - r = parse(r"http://google.co.uk/search?var=bleahdhsad") - test(r) - r = parse(r"http://dom96:test@google.com:80/search.php?q=562gs6&foo=6gs6&bar=7hs6#test") - test(r) - r = parse(r"http://www.google.co.uk/search?q=multiple+subdomains&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:pl:official&client=firefox-a") - test(r) \ No newline at end of file +proc `$`*(t: TURL): string = + result = "" + if t.scheme != "": result.add(t.scheme & "://") + if t.username != "": + if t.password != "": + result.add(t.username & ":" & t.password & "@") + else: + result.add(t.username & "@") + if t.hostname != "": result.add(t.hostname) + if t.port != "": result.add(":" & t.port) + if t.path != "": result.add(t.path) + if t.query != "": result.add(t.query) + if t.anchor != "": result.add(t.anchor) diff --git a/lib/devel/parseutils.nim b/lib/devel/parseutils.nim new file mode 100644 index 000000000..4c5152167 --- /dev/null +++ b/lib/devel/parseutils.nim @@ -0,0 +1,63 @@ +# +# +# Nimrod's Runtime Library +# (c) Copyright 2010 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## Helpers for parsing. + +import strutils + +proc parseHex*(s: string, number: var int, start = 0): int = + ## parses a hexadecimal number and stores its value in ``number``. Returns + ## the number of the parsed characters or 0 in case of an error. + var i = start + var foundDigit = false + if s[i] == '0' and (s[i+1] == 'x' or s[i+1] == 'X'): inc(i, 2) + elif s[i] == '#': inc(i) + while true: + case s[i] + of '_': nil + of '0'..'9': + number = number shl 4 or (ord(s[i]) - ord('0')) + foundDigit = true + of 'a'..'f': + number = number shl 4 or (ord(s[i]) - ord('a') + 10) + foundDigit = true + of 'A'..'F': + number = number shl 4 or (ord(s[i]) - ord('A') + 10) + foundDigit = true + else: break + inc(i) + if foundDigit: result = i-start + +proc parseIdent*(s: string, ident: var string, start = 0): int = + ## parses an identifier and stores it in ``ident``. Returns + ## the number of the parsed characters or 0 in case of an error. + var i = start + if s[i] in IdentStartChars: + inc(i) + while s[i] in IdentChars: inc(i) + ident = copy(s, start, i-1) + result = i-start + +proc skipWhitespace*(s: string, start = 0): int {.inline.} = + while s[start+result] in Whitespace: inc(result) + +proc skip*(s, token: string, start = 0): int = + while result < token.len and s[result+start] == token[result]: inc(result) + if result != token.len: result = 0 + +proc skipIgnoreCase*(s, token: string, start = 0): int = + while result < token.len and + toLower(s[result+start]) == toLower(token[result]): inc(result) + if result != token.len: result = 0 + +proc parseBiggestInt*(s: string, number: var biggestInt, start = 0): int = + assert(false) # to implement + +proc parseBiggestFloat*(s: string, number: var biggestFloat, start = 0): int = + assert(false) # to implement |