#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
## parser.
## The only encoding that is supported is UTF-8. The parser has been designed
## to be somewhat error correcting, so that even most "wild HTML" found on the
## web can be parsed with it. **Note:** This parser does not check that each
## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
## implemented by the client code for various reasons:
##
## * Old HTML contains tags that have no end tag: ``<br>`` for example.
## * HTML tags are case insensitive, XML tags are case sensitive. Since this
## library can parse both, only the client knows which comparison is to be
## used.
## * Thus the checks would have been very difficult to implement properly with
## little benefit, especially since they are simple to implement in the
## client. The client should use the `errorMsgExpected` proc to generate
## a nice error message that fits the other error messages this library
## creates.
##
##
## Example 1: Retrieve HTML title
## ==============================
##
## The file ``examples/htmltitle.nim`` demonstrates how to use the
## XML parser to accomplish a simple task: To determine the title of an HTML
## document.
##
## .. code-block:: nimrod
## :file: examples/htmltitle.nim
##
##
## Example 2: Retrieve all HTML links
## ==================================
##
## The file ``examples/htmlrefs.nim`` demonstrates how to use the
## XML parser to accomplish another simple task: To determine all the links
## an HTML document contains.
##
## .. code-block:: nimrod
## :file: examples/htmlrefs.nim
##
import
hashes, strutils, lexbase, streams, unicode
# the parser treats ``<br />`` as ``<br></br>``
# xmlElementCloseEnd, ## ``/>``
type
  TXmlEventKind* = enum ## enumeration of all events that may occur when parsing
    xmlError,           ## an error occurred during parsing
    xmlEof,             ## end of file reached
    xmlCharData,        ## character data
    xmlWhitespace,      ## whitespace has been parsed
    xmlComment,         ## a comment has been parsed
    xmlPI,              ## processing instruction (``<?name something ?>``)
    xmlElementStart,    ## ``<elem>``
    xmlElementEnd,      ## ``</elem>``
    xmlElementOpen,     ## ``<elem``
    xmlAttribute,       ## ``key = "value"`` pair
    xmlElementClose,    ## ``>``
    xmlCData,           ## ``<![CDATA[`` ... data ... ``]]>``
    xmlEntity,          ## &entity;
    xmlSpecial          ## ``<! ... data ... >``
  TXmlError* = enum          ## enumeration that lists all errors that can occur
    errNone,                 ## no error
    errEndOfCDataExpected,   ## ``]]>`` expected
    errNameExpected,         ## name expected
    errSemicolonExpected,    ## ``;`` expected
    errQmGtExpected,         ## ``?>`` expected
    errGtExpected,           ## ``>`` expected
    errEqExpected,           ## ``=`` expected
    errQuoteExpected,        ## ``"`` or ``'`` expected
    errEndOfCommentExpected  ## ``-->`` expected
  TParserState = enum
    stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
  TXmlParseOption* = enum  ## options for the XML parser
    reportWhitespace,      ## report whitespace
    reportComments         ## report comments
  TXmlParser* = object of TBaseLexer ## the parser object.
    a, b, c: string       # scratch strings; `a` and `b` back the accessors
    kind: TXmlEventKind   # kind of the current event
    err: TXmlError        # error code recorded by `markError`
    state: TParserState   # position of the internal state machine
    filename: string      # only used when formatting error messages
    options: set[TXmlParseOption]  # reporting options handed to `open`
const
  # human readable text for every `TXmlError`, in declaration order
  errorMessages: array[TXmlError, string] = [
    "no error",                  # errNone
    "']]>' expected",            # errEndOfCDataExpected
    "name expected",             # errNameExpected
    "';' expected",              # errSemicolonExpected
    "'?>' expected",             # errQmGtExpected
    "'>' expected",              # errGtExpected
    "'=' expected",              # errEqExpected
    "'\"' or \"'\" expected",    # errQuoteExpected
    "'-->' expected"             # errEndOfCommentExpected
  ]
proc open*(my: var TXmlParser, input: PStream, filename: string,
           options: set[TXmlParseOption] = {}) =
  ## prepares `my` for reading from the stream `input`. `filename` does not
  ## have to name a real file; it is only embedded in error messages.
  ## `options` selects which optional events are delivered: with
  ## ``reportWhitespace`` whitespace tokens show up as ``xmlWhitespace``
  ## events, with ``reportComments`` comment tokens show up as ``xmlComment``
  ## events.
  lexbase.open(my, input)
  # start with empty scratch strings and the initial parser state
  my.a = ""
  my.b = ""
  my.kind = xmlError
  my.state = stateStart
  my.options = options
  my.filename = filename
proc close*(my: var TXmlParser) {.inline.} =
  ## shuts down the parser `my`; the work is delegated to ``lexbase.close``,
  ## which also takes care of the input stream attached to it.
  lexbase.close(my)
proc charData*(my: TXmlParser): string {.inline.} =
  ## yields the text payload of the current event. Only valid while the
  ## current event is one of ``xmlCharData``, ``xmlWhitespace``,
  ## ``xmlComment``, ``xmlCData`` or ``xmlSpecial``.
  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                     xmlSpecial})
  result = my.a
proc kind*(my: TXmlParser): TXmlEventKind {.inline.} =
  ## yields the kind of the event the parser `my` is currently positioned on.
  result = my.kind
proc elementName*(my: TXmlParser): string {.inline.} =
  ## yields the name of the element the current event refers to. Only valid
  ## for the events ``xmlElementStart``, ``xmlElementEnd`` and
  ## ``xmlElementOpen``.
  assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  result = my.a
proc entityName*(my: TXmlParser): string {.inline.} =
  ## yields the name of the entity; only valid for the event ``xmlEntity``.
  assert(my.kind == xmlEntity)
  result = my.a
proc attrKey*(my: TXmlParser): string {.inline.} =
  ## yields the key of the attribute; only valid for the event
  ## ``xmlAttribute``.
  assert(my.kind == xmlAttribute)
  result = my.a
proc attrValue*(my: TXmlParser): string {.inline.} =
  ## yields the value of the attribute; only valid for the event
  ## ``xmlAttribute``.
  assert(my.kind == xmlAttribute)
  result = my.b
proc PIName*(my: TXmlParser): string {.inline.} =
  ## yields the name of the processing instruction; only valid for the event
  ## ``xmlPI``.
  assert(my.kind == xmlPI)
  result = my.a
proc PIRest*(my: TXmlParser): string {.inline.} =
  ## yields everything of the processing instruction that follows its name;
  ## only valid for the event ``xmlPI``.
  assert(my.kind == xmlPI)
  result = my.b
proc getColumn*(my: TXmlParser): int {.inline.} =
  ## yields the column of the position the parser has advanced to, as
  ## computed by ``getColNumber`` for the current buffer position.
  return getColNumber(my, my.bufPos)
proc getLine*(my: TXmlParser): int {.inline.} =
  ## yields the line the parser has advanced to.
  return my.linenumber
proc getFilename*(my: TXmlParser): string {.inline.} =
  ## yields the filename that was handed to `open` for this parser.
  return my.filename
proc errorMsg*(my: TXmlParser): string =
  ## builds a readable message for the event ``xmlError`` in the form
  ## ``filename(line, column) Error: description``. Only valid for that event.
  assert(my.kind == xmlError)
  var description = errorMessages[my.err]
  var line = $getLine(my)
  var column = $getColumn(my)
  result = "$1($2, $3) Error: $4" % [my.filename, line, column, description]
proc errorMsgExpected*(my: TXmlParser, tag: string): string =
  ## builds the message ``<tag> expected`` for the given `tag`, prefixed with
  ## filename, line and column exactly like the parser's own error messages.
  var expectation = "<$1> expected" % tag
  var line = $getLine(my)
  var column = $getColumn(my)
  result = "$1($2, $3) Error: $4" % [my.filename, line, column, expectation]
proc errorMsg*(my: TXmlParser, msg: string): string =
  ## wraps the caller supplied text `msg` in the same
  ## ``filename(line, column) Error: ...`` layout the parser's own error
  ## messages use.
  var line = $getLine(my)
  var column = $getColumn(my)
  result = "$1($2, $3) Error: $4" % [my.filename, line, column, msg]
proc markError(my: var TXmlParser, kind: TXmlError) {.inline.} =
  # switch the parser into the error state and remember which error it was
  my.state = stateError
  my.err = kind
proc parseCDATA(my: var TXMLParser) =
  # Scans a CDATA section and appends its content to `my.a`.
  # Scanning starts len("<![CDATA[") characters after `my.bufpos`, i.e. the
  # caller is presumably positioned on the opening marker (not checked here).
  # The section ends at the first "]]>"; reaching the end of the buffer
  # ('\0') first records `errEndOfCDataExpected`. CR and LF are both stored
  # as a single '\L' in `my.a`. The event kind is set to `xmlCData` in
  # either case.
  var pos = my.bufpos + len("<![CDATA[")
  var buf = my.buf
  while true:
    case buf[pos]
    of ']':
      # only the full "]]>" sequence terminates the section
      if buf[pos+1] == ']' and buf[pos+2] == '>':
        inc(pos, 3)
        break
      add(my.a, ']')
      inc(pos)
    of '\0':
      markError(my, errEndOfCDataExpected)
      break
    of '\c':
      pos = lexbase.HandleCR(my, pos)
      buf = my.buf # re-read the buffer after handing control to lexbase
      add(my.a, '\L')
    of '\L':
      pos = lexbase.HandleLF(my, pos)
      buf = my.buf # re-read the buffer after handing control to lexbase
      add(my.a, '\L')
    else:
      add(my.a, buf[pos])
      inc(pos)
  my.bufpos = pos # store back
  my.kind = xmlCDATA
# NOTE(review): this region does not look intact. The `parseComment` header
# is followed by a single statement that refers to `x`, which is not a
# parameter of this proc, and the `of` branches after it have no visible
# enclosing `case`. Presumably source text between the two parts is missing
# from this copy - TODO restore from a pristine copy of the module.
proc parseComment(my: var TXMLParser) =
var pos = my.bufpos + len("" % x.charData)
# NOTE(review): the branches below echo a textual rendering of the event
# kinds of a parser `x` that is declared outside the visible text.
of xmlPI: echo(" $1 ## $2 ?>" % [x.PIName, x.PIRest])
of xmlElementStart: echo("<$1>" % x.elementName)
of xmlElementEnd: echo("$1>" % x.elementName)
of xmlElementOpen: echo("<$1" % x.elementName)
of xmlAttribute:
# attributes are printed as two lines, key first and value second
echo("Key: " & x.attrKey)
echo("Value: " & x.attrValue)
of xmlElementClose: echo(">")
of xmlCData:
# NOTE(review): empty format string, so the `charData` argument is never
# printed - confirm the intended text
echo("" % x.charData)
of xmlEntity:
echo("&$1;" % x.entityName)
of xmlSpecial:
echo("SPECIAL: " & x.charData)
# releases the parser `x` via `close`
close(x)