summary refs log tree commit diff stats
path: root/lib/pure/parsexml.nim
diff options
context:
space:
mode:
authorAndreas Rumpf <rumpf_a@web.de>2009-06-08 08:06:25 +0200
committerAndreas Rumpf <rumpf_a@web.de>2009-06-08 08:06:25 +0200
commit4d4b3b1c04d41868ebb58bd9ccba7b303007e900 (patch)
tree909ed0aad0b145733521f4ac2bfb938dd4b43785 /lib/pure/parsexml.nim
parentce88dc3e67436939b03f97e624c11ca6058fedce (diff)
downloadNim-4d4b3b1c04d41868ebb58bd9ccba7b303007e900.tar.gz
version0.7.10
Diffstat (limited to 'lib/pure/parsexml.nim')
-rw-r--r--lib/pure/parsexml.nim635
1 files changed, 635 insertions, 0 deletions
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim
new file mode 100644
index 000000000..242a4f01c
--- /dev/null
+++ b/lib/pure/parsexml.nim
@@ -0,0 +1,635 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2009 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
+## parser. 
+## The only encoding that is supported is UTF-8. The parser has been designed
+## to be somewhat error correcting, so that even most "wild HTML" found on the 
+## web can be parsed with it. **Note:** This parser does not check that each
+## ``<tag>`` has a corresponding ``</tag>``! These checks have do be 
+## implemented by the client code for various reasons: 
+##
+## * Old HTML contains tags that have no end tag: ``<br>`` for example.
+## * HTML tags are case insensitive, XML tags are case sensitive. Since this
+##   library can parse both, only the client knows which comparison is to be
+##   used.
+## * Thus the checks would have been very difficult to implement properly with
+##   little benefit, especially since they are simple to implement in the 
+##   client. The client should use the `errorMsgExpected` proc to generate
+##   a nice error message that fits the other error messages this library
+##   creates.
+##
+##
+## Example 1: Retrieve HTML title
+## ==============================
+##
+## The file ``examples/htmltitle.nim`` demonstrates how to use the 
+## XML parser to accomplish a simple task: To determine the title of an HTML
+## document.
+##
+## .. code-block:: nimrod
+##     :file: examples/htmltitle.nim
+##
+##
+## Example 2: Retrieve all HTML links
+## ==================================
+##
+## The file ``examples/htmlrefs.nim`` demonstrates how to use the 
+## XML parser to accomplish another simple task: To determine all the links 
+## an HTML document contains.
+##
+## .. code-block:: nimrod
+##     :file: examples/htmlrefs.nim
+##
+
+import 
+  hashes, strutils, lexbase, streams, unicode
+
+# the parser treats ``<br />`` as ``<br></br>``
+
+type 
+  TXmlEventKind* = enum ## enumation of all events that may occur when parsing
+    xmlError,           ## an error ocurred during parsing
+    xmlEof,             ## end of file reached
+    xmlCharData,        ## character data
+    xmlWhitespace,      ## whitespace has been parsed
+    xmlComment,         ## a comment has been parsed
+    xmlPI,              ## processing instruction (``<?name something ?>``)
+    xmlElementStart,    ## ``<elem>``
+    xmlElementEnd,      ## ``</elem>``
+    xmlElementOpen,     ## ``<elem 
+    xmlAttribute,       ## ``key = "value"`` pair
+    xmlElementClose,    ## ``>`` 
+    xmlCData,           ## ``<![CDATA[`` ... data ... ``]]>``
+    xmlEntity,          ## &entity;
+    xmlSpecial          ## ``<! ... data ... >``
+    
+  TXmlError* = enum          ## enumeration that lists all errors that can occur
+    errNone,                 ## no error
+    errEndOfCDataExpected,   ## ``]]>`` expected
+    errNameExpected,         ## name expected
+    errSemicolonExpected,    ## ``;`` expected
+    errQmGtExpected,         ## ``?>`` expected
+    errGtExpected,           ## ``>`` expected
+    errEqExpected,           ## ``=`` expected
+    errQuoteExpected,        ## ``"`` or ``'`` expected
+    errEndOfCommentExpected  ## ``-->`` expected
+    
+  TParserState = enum 
+    stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
+
+  TXmlParseOption* = enum  ## options for the XML parser
+    reportWhitespace,      ## report whitespace
+    reportComments         ## report comments
+
+  TXmlParser* = object of TBaseLexer ## the parser object.
+    a, b: string
+    kind: TXmlEventKind
+    err: TXmlError
+    state: TParserState
+    filename: string
+    options: set[TXmlParseOption]
+ 
+const
+  errorMessages: array [TXmlError, string] = [
+    "no error",
+    "']]>' expected",
+    "name expected",
+    "';' expected",
+    "'?>' expected",
+    "'>' expected",
+    "'=' expected",
+    "'\"' or \"'\" expected",
+    "'-->' expected"
+  ]
+
+proc open*(my: var TXmlParser, input: PStream, filename: string,
+           options: set[TXmlParseOption] = {}) =
+  ## initializes the parser with an input stream. `Filename` is only used
+  ## for nice error messages. The parser's behaviour can be controlled by
+  ## the `options` parameter: If `options` contains ``reportWhitespace``
+  ## a whitespace token is reported as an ``xmlWhitespace`` event.
+  ## If `options` contains ``reportComments`` a comment token is reported as an
+  ## ``xmlComment`` event. 
+  lexbase.open(my, input)
+  my.filename = filename
+  my.state = stateStart
+  my.kind = xmlError
+  my.a = ""
+  my.b = ""
+  my.options = options
+  
+proc close*(my: var TXmlParser) {.inline.} = 
+  ## closes the parser `my` and its associated input stream.
+  lexbase.close(my)
+
+proc charData*(my: TXmlParser): string {.inline.} = 
+  ## returns the character data for the events: ``xmlCharData``, 
+  ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
+  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, 
+                     xmlSpecial})
+  return my.a
+
+proc kind*(my: TXmlParser): TXmlEventKind {.inline.} = 
+  ## returns the current event type for the XML parser
+  return my.kind
+
+proc elementName*(my: TXmlParser): string {.inline.} = 
+  ## returns the element name for the events: ``xmlElementStart``, 
+  ## ``xmlElementEnd``, ``xmlElementOpen``
+  assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
+  return my.a
+
+proc entityName*(my: TXmlParser): string {.inline.} = 
+  ## returns the entity name for the event: ``xmlEntity``
+  assert(my.kind == xmlEntity)
+  return my.a
+  
+proc attrKey*(my: TXmlParser): string {.inline.} = 
+  ## returns the attribute key for the event ``xmlAttribute``
+  assert(my.kind == xmlAttribute)
+  return my.a
+  
+proc attrValue*(my: TXmlParser): string {.inline.} = 
+  ## returns the attribute value for the event ``xmlAttribute``
+  assert(my.kind == xmlAttribute)
+  return my.b
+
+proc PIName*(my: TXmlParser): string {.inline.} = 
+  ## returns the processing instruction name for the event ``xmlPI``
+  assert(my.kind == xmlPI)
+  return my.a
+
+proc PIRest*(my: TXmlParser): string {.inline.} = 
+  ## returns the rest of the processing instruction for the event ``xmlPI``
+  assert(my.kind == xmlPI)
+  return my.b
+
+proc getColumn*(my: TXmlParser): int {.inline.} = 
+  ## get the current column the parser has arrived at.
+  result = getColNumber(my, my.bufPos)
+
+proc getLine*(my: TXmlParser): int {.inline.} = 
+  ## get the current line the parser has arrived at.
+  result = my.linenumber
+
+proc getFilename*(my: TXmlParser): string {.inline.} = 
+  ## get the filename of the file that the parser processes.
+  result = my.filename
+  
+proc errorMsg*(my: TXmlParser): string = 
+  ## returns a helpful error message for the event ``xmlError``
+  assert(my.kind == xmlError)
+  result = "$1($2, $3) Error: $4" % [
+    my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
+
+proc errorMsgExpected*(my: TXmlParser, tag: string): string = 
+  ## returns an error message "<tag> expected" in the same format as the
+  ## other error messages 
+  result = "$1($2, $3) Error: $4" % [
+    my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
+    
+proc markError(my: var TXmlParser, kind: TXmlError) {.inline.} = 
+  my.err = kind
+  my.state = stateError
+
+proc parseCDATA(my: var TXMLParser) = 
+  var pos = my.bufpos + len("<![CDATA[")
+  var buf = my.buf
+  while true:
+    case buf[pos] 
+    of ']':
+      if buf[pos+1] == ']' and buf[pos+2] == '>':
+        inc(pos, 3)
+        break
+      add(my.a, ']')
+      inc(pos)
+    of '\0': 
+      markError(my, errEndOfCDataExpected)
+      break
+    of '\c': 
+      pos = lexbase.HandleCR(my, pos)
+      buf = my.buf
+      add(my.a, '\L')
+    of '\L': 
+      pos = lexbase.HandleLF(my, pos)
+      buf = my.buf
+      add(my.a, '\L')
+    else:
+      add(my.a, buf[pos])
+      inc(pos)    
+  my.bufpos = pos # store back
+  my.kind = xmlCDATA
+
+proc parseComment(my: var TXMLParser) = 
+  var pos = my.bufpos + len("<!--")
+  var buf = my.buf
+  while true:
+    case buf[pos] 
+    of '-':
+      if buf[pos+1] == '-' and buf[pos+2] == '>':
+        inc(pos, 3)
+        break
+      if my.options.contains(reportComments): add(my.a, '-')
+      inc(pos)
+    of '\0': 
+      markError(my, errEndOfCommentExpected)
+      break
+    of '\c': 
+      pos = lexbase.HandleCR(my, pos)
+      buf = my.buf
+      if my.options.contains(reportComments): add(my.a, '\L')
+    of '\L': 
+      pos = lexbase.HandleLF(my, pos)
+      buf = my.buf
+      if my.options.contains(reportComments): add(my.a, '\L')
+    else:
+      if my.options.contains(reportComments): add(my.a, buf[pos])
+      inc(pos)
+  my.bufpos = pos
+  my.kind = xmlComment
+
+proc parseWhitespace(my: var TXmlParser, skip=False) = 
+  var pos = my.bufpos
+  var buf = my.buf
+  while true: 
+    case buf[pos]
+    of ' ', '\t': 
+      if not skip: add(my.a, buf[pos])
+      Inc(pos)
+    of '\c':  
+      # the specification says that CR-LF, CR are to be transformed to LF
+      pos = lexbase.HandleCR(my, pos)
+      buf = my.buf
+      if not skip: add(my.a, '\L')
+    of '\L': 
+      pos = lexbase.HandleLF(my, pos)
+      buf = my.buf
+      if not skip: add(my.a, '\L')
+    else:
+      break
+  my.bufpos = pos
+
+const
+  NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
+  NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
+
+proc parseName(my: var TXmlParser, dest: var string) = 
+  var pos = my.bufpos
+  var buf = my.buf
+  if buf[pos] in nameStartChar: 
+    while true:
+      add(dest, buf[pos])
+      inc(pos)
+      if buf[pos] notin NameChar: break
+    my.bufpos = pos
+  else:
+    markError(my, errNameExpected)
+
+proc parseEntity(my: var TXmlParser, dest: var string) = 
+  var pos = my.bufpos+1
+  var buf = my.buf
+  my.kind = xmlCharData
+  if buf[pos] == '#':
+    var r: TRune
+    inc(pos)
+    if buf[pos] == 'x': 
+      inc(pos)
+      while true:
+        case buf[pos]
+        of '0'..'9': r = (r shl 4) or (ord(buf[pos]) - ord('0'))
+        of 'a'..'f': r = (r shl 4) or (ord(buf[pos]) - ord('a') + 10)
+        of 'A'..'F': r = (r shl 4) or (ord(buf[pos]) - ord('A') + 10)
+        else: break
+        inc(pos)
+    else:
+      while buf[pos] in {'0'..'9'}: 
+        r = r * 10 + (ord(buf[pos]) - ord('0'))
+        inc(pos)
+    add(dest, toUTF8(r))
+  elif buf[pos] == 'l' and buf[pos+1] == 't':
+    add(dest, '<')
+    inc(pos, 2)
+  elif buf[pos] == 'g' and buf[pos+1] == 't':
+    add(dest, '>')
+    inc(pos, 2)
+  elif buf[pos] == 'a' and buf[pos+1] == 'm' and buf[pos+2] == 'p':
+    add(dest, '&')
+    inc(pos, 3)
+  elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and 
+      buf[pos+3] == 's':
+    add(dest, '\'')
+    inc(pos, 4)
+  elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and 
+      buf[pos+3] == 't':
+    add(dest, '"')
+    inc(pos, 4)
+  else:
+    my.bufpos = pos
+    parseName(my, dest)
+    pos = my.bufpos
+    if my.err != errNameExpected: 
+      my.kind = xmlEntity
+    else:
+      add(dest, '&')
+  if buf[pos] == ';': 
+    inc(pos)
+  else:
+    markError(my, errSemiColonExpected)
+  my.bufpos = pos
+
+proc parsePI(my: var TXmlParser) = 
+  inc(my.bufpos, "<?".len)
+  parseName(my, my.a)
+  var pos = my.bufpos
+  var buf = my.buf
+  setLen(my.b, 0)
+  while true: 
+    case buf[pos]
+    of '\0':
+      markError(my, errQmGtExpected)
+      break
+    of '?':
+      if buf[pos+1] == '>':
+        inc(pos, 2)
+        break
+      add(my.b, '?')
+      inc(pos)
+    of '\c':  
+      # the specification says that CR-LF, CR are to be transformed to LF
+      pos = lexbase.HandleCR(my, pos)
+      buf = my.buf      
+      add(my.b, '\L')
+    of '\L': 
+      pos = lexbase.HandleLF(my, pos)
+      buf = my.buf
+      add(my.b, '\L')
+    else:
+      add(my.b, buf[pos])
+      inc(pos)
+  my.bufpos = pos
+  my.kind = xmlPI
+
+proc parseSpecial(my: var TXmlParser) = 
+  # things that start with <!
+  var pos = my.bufpos + 2
+  var buf = my.buf
+  var opentags = 0
+  while true: 
+    case buf[pos]
+    of '\0':
+      markError(my, errGtExpected)
+      break
+    of '<': 
+      inc(opentags)
+      inc(pos)
+      add(my.a, '<')
+    of '>':
+      if opentags <= 0:
+        inc(pos)
+        break
+      dec(opentags)
+      inc(pos)
+      add(my.a, '>')
+    of '\c':  
+      pos = lexbase.HandleCR(my, pos)
+      buf = my.buf
+      add(my.a, '\L')
+    of '\L': 
+      pos = lexbase.HandleLF(my, pos)
+      buf = my.buf
+      add(my.a, '\L')
+    else:
+      add(my.a, buf[pos])
+      inc(pos)
+  my.bufpos = pos
+  my.kind = xmlSpecial
+
+proc parseTag(my: var TXmlParser) = 
+  inc(my.bufpos)
+  parseName(my, my.a)
+  # if we have no name, do not interpret the '<':
+  if my.a.len == 0: 
+    my.kind = xmlCharData
+    add(my.a, '<')
+    return
+  parseWhitespace(my, skip=True)
+  if my.buf[my.bufpos] in NameStartChar: 
+    # an attribute follows:
+    my.kind = xmlElementOpen
+    my.state = stateAttr
+  else:
+    my.kind = xmlElementStart
+    if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
+      inc(my.bufpos, 2)
+      my.state = stateEmptyElementTag
+    elif my.buf[my.bufpos] == '>':
+      inc(my.bufpos)  
+    else:
+      markError(my, errGtExpected)
+  
+proc parseEndTag(my: var TXmlParser) = 
+  inc(my.bufpos, 2)
+  parseName(my, my.a)
+  parseWhitespace(my, skip=True)
+  if my.buf[my.bufpos] == '>':
+    inc(my.bufpos)
+  else:
+    markError(my, errGtExpected)
+  my.kind = xmlElementEnd
+
+proc parseAttribute(my: var TXmlParser) = 
+  my.kind = xmlAttribute
+  setLen(my.a, 0)
+  setLen(my.b, 0)
+  parseName(my, my.a)
+  # if we have no name, we have '<tag attr= key %&$$%':
+  if my.a.len == 0: 
+    markError(my, errGtExpected)
+    return
+  parseWhitespace(my, skip=True)
+  if my.buf[my.bufpos] != '=':
+    markError(my, errEqExpected)
+    return
+  inc(my.bufpos)
+  parseWhitespace(my, skip=True)
+
+  var pos = my.bufpos
+  var buf = my.buf
+  if buf[pos] in {'\'', '"'}:
+    var quote = buf[pos]
+    var pendingSpace = false
+    inc(pos)
+    while true: 
+      case buf[pos]
+      of '\0':
+        markError(my, errQuoteExpected)
+        break
+      of '&': 
+        if pendingSpace: 
+          add(my.b, ' ')
+          pendingSpace = false
+        my.bufpos = pos
+        parseEntity(my, my.b)
+        my.kind = xmlAttribute # parseEntity overwrites my.kind!
+        pos = my.bufpos
+      of ' ', '\t': 
+        pendingSpace = true
+        inc(pos)
+      of '\c':  
+        pos = lexbase.HandleCR(my, pos)
+        buf = my.buf
+        pendingSpace = true
+      of '\L': 
+        pos = lexbase.HandleLF(my, pos)
+        buf = my.buf
+        pendingSpace = true
+      else:
+        if buf[pos] == quote:
+          inc(pos)
+          break
+        else:
+          if pendingSpace: 
+            add(my.b, ' ')
+            pendingSpace = false
+          add(my.b, buf[pos])
+          inc(pos)
+  else:
+    markError(my, errQuoteExpected)  
+  my.bufpos = pos
+  parseWhitespace(my, skip=True)
+  
+proc parseCharData(my: var TXmlParser) = 
+  var pos = my.bufpos
+  var buf = my.buf
+  while true: 
+    case buf[pos]
+    of '\0', '<', '&': break
+    of '\c':  
+      # the specification says that CR-LF, CR are to be transformed to LF
+      pos = lexbase.HandleCR(my, pos)
+      buf = my.buf
+      add(my.a, '\L')
+    of '\L': 
+      pos = lexbase.HandleLF(my, pos)
+      buf = my.buf
+      add(my.a, '\L')
+    else:
+      add(my.a, buf[pos])
+      inc(pos)
+  my.bufpos = pos
+  my.kind = xmlCharData
+
+proc rawGetTok(my: var TXmlParser) = 
+  my.kind = xmlError
+  setLen(my.a, 0)
+  var pos = my.bufpos
+  var buf = my.buf
+  case buf[pos]
+  of '<': 
+    case buf[pos+1]
+    of '/':
+      parseEndTag(my)
+    of '!':
+      if buf[pos+2] == '[' and buf[pos+3] == 'C' and buf[pos+4] == 'D' and
+          buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and
+          buf[pos+8] == '[':
+        parseCDATA(my)
+      elif buf[pos+2] == '-' and buf[pos+3] == '-': 
+        parseComment(my)
+      else: 
+        parseSpecial(my)
+    of '?':
+      parsePI(my)
+    else: 
+      parseTag(my)
+  of ' ', '\t', '\c', '\l': 
+    parseWhiteSpace(my)
+    my.kind = xmlWhitespace
+  of '\0': 
+    my.kind = xmlEof
+  of '&':
+    parseEntity(my, my.a)
+  else: 
+    parseCharData(my)
+  assert my.kind != xmlError
+    
+proc getTok(my: var TXmlParser) = 
+  while true:
+    rawGetTok(my)
+    case my.kind
+    of xmlComment: 
+      if my.options.contains(reportComments): break
+    of xmlWhitespace: 
+      if my.options.contains(reportWhitespace): break
+    else: break
+    
+proc next*(my: var TXmlParser) = 
+  ## retrieves the first/next event. This controls the parser.
+  case my.state
+  of stateNormal:
+    getTok(my)  
+  of stateStart:
+    getTok(my)
+    if my.kind == xmlPI and my.a == "xml": 
+      # just skip the first ``<?xml >`` processing instruction
+      getTok(my)
+    my.state = stateNormal
+  of stateAttr:
+    # parse an attribute key-value pair:
+    if my.buf[my.bufpos] == '>':
+      my.kind = xmlElementClose
+      inc(my.bufpos)
+      my.state = stateNormal
+    elif my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': 
+      my.kind = xmlElementClose
+      inc(my.bufpos, 2)
+      my.state = stateEmptyElementTag
+    else:
+      parseAttribute(my)
+      # state remains the same
+  of stateEmptyElementTag:
+    my.state = stateNormal
+    my.kind = xmlElementEnd
+  of stateError: 
+    my.kind = xmlError
+    my.state = stateNormal
+  
+when isMainModule:
+  import os
+  var s = newFileStream(ParamStr(1), fmRead)
+  if s == nil: quit("cannot open the file" & ParamStr(1))
+  var x: TXmlParser
+  open(x, s, ParamStr(1))
+  while true:
+    next(x)
+    case x.kind
+    of xmlError: Echo(x.errorMsg())
+    of xmlEof: break
+    of xmlCharData: echo(x.charData)
+    of xmlWhitespace: echo("|$1|" % x.charData)
+    of xmlComment: echo("<!-- $1 -->" % x.charData)
+    of xmlPI: echo("<? $1 ## $2 ?>" % [x.PIName, x.PIRest])
+    of xmlElementStart: echo("<$1>" % x.elementName)
+    of xmlElementEnd: echo("</$1>" % x.elementName)
+    
+    of xmlElementOpen: echo("<$1" % x.elementName) 
+    of xmlAttribute:   
+      echo("Key: " & x.attrKey)
+      echo("Value: " & x.attrValue)
+    
+    of xmlElementClose: echo(">") 
+    of xmlCData:
+      echo("<![CDATA[$1]]>" % x.charData)
+    of xmlEntity:
+      echo("&$1;" % x.entityName)
+    of xmlSpecial:
+      echo("SPECIAL: " & x.charData)
+  close(x)
+