summary refs log tree commit diff stats
path: root/nimlib/pure/parsexml.nim
diff options
context:
space:
mode:
Diffstat (limited to 'nimlib/pure/parsexml.nim')
-rwxr-xr-xnimlib/pure/parsexml.nim635
1 files changed, 0 insertions, 635 deletions
diff --git a/nimlib/pure/parsexml.nim b/nimlib/pure/parsexml.nim
deleted file mode 100755
index 54f62a9a4..000000000
--- a/nimlib/pure/parsexml.nim
+++ /dev/null
@@ -1,635 +0,0 @@
-#
-#
-#            Nimrod's Runtime Library
-#        (c) Copyright 2009 Andreas Rumpf
-#
-#    See the file "copying.txt", included in this
-#    distribution, for details about the copyright.
-#
-
-## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
-## parser. 
-## The only encoding that is supported is UTF-8. The parser has been designed
-## to be somewhat error correcting, so that even most "wild HTML" found on the 
-## web can be parsed with it. **Note:** This parser does not check that each
-## ``<tag>`` has a corresponding ``</tag>``! These checks have do be 
-## implemented by the client code for various reasons: 
-##
-## * Old HTML contains tags that have no end tag: ``<br>`` for example.
-## * HTML tags are case insensitive, XML tags are case sensitive. Since this
-##   library can parse both, only the client knows which comparison is to be
-##   used.
-## * Thus the checks would have been very difficult to implement properly with
-##   little benefit, especially since they are simple to implement in the 
-##   client. The client should use the `errorMsgExpected` proc to generate
-##   a nice error message that fits the other error messages this library
-##   creates.
-##
-##
-## Example 1: Retrieve HTML title
-## ==============================
-##
-## The file ``examples/htmltitle.nim`` demonstrates how to use the 
-## XML parser to accomplish a simple task: To determine the title of an HTML
-## document.
-##
-## .. code-block:: nimrod
-##     :file: examples/htmltitle.nim
-##
-##
-## Example 2: Retrieve all HTML links
-## ==================================
-##
-## The file ``examples/htmlrefs.nim`` demonstrates how to use the 
-## XML parser to accomplish another simple task: To determine all the links 
-## an HTML document contains.
-##
-## .. code-block:: nimrod
-##     :file: examples/htmlrefs.nim
-##
-
-import 
-  hashes, strutils, lexbase, streams, unicode
-
-# the parser treats ``<br />`` as ``<br></br>``
-
-type 
-  TXmlEventKind* = enum ## enumation of all events that may occur when parsing
-    xmlError,           ## an error ocurred during parsing
-    xmlEof,             ## end of file reached
-    xmlCharData,        ## character data
-    xmlWhitespace,      ## whitespace has been parsed
-    xmlComment,         ## a comment has been parsed
-    xmlPI,              ## processing instruction (``<?name something ?>``)
-    xmlElementStart,    ## ``<elem>``
-    xmlElementEnd,      ## ``</elem>``
-    xmlElementOpen,     ## ``<elem 
-    xmlAttribute,       ## ``key = "value"`` pair
-    xmlElementClose,    ## ``>`` 
-    xmlCData,           ## ``<![CDATA[`` ... data ... ``]]>``
-    xmlEntity,          ## &entity;
-    xmlSpecial          ## ``<! ... data ... >``
-    
-  TXmlError* = enum          ## enumeration that lists all errors that can occur
-    errNone,                 ## no error
-    errEndOfCDataExpected,   ## ``]]>`` expected
-    errNameExpected,         ## name expected
-    errSemicolonExpected,    ## ``;`` expected
-    errQmGtExpected,         ## ``?>`` expected
-    errGtExpected,           ## ``>`` expected
-    errEqExpected,           ## ``=`` expected
-    errQuoteExpected,        ## ``"`` or ``'`` expected
-    errEndOfCommentExpected  ## ``-->`` expected
-    
-  TParserState = enum 
-    stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
-
-  TXmlParseOption* = enum  ## options for the XML parser
-    reportWhitespace,      ## report whitespace
-    reportComments         ## report comments
-
-  TXmlParser* = object of TBaseLexer ## the parser object.
-    a, b: string
-    kind: TXmlEventKind
-    err: TXmlError
-    state: TParserState
-    filename: string
-    options: set[TXmlParseOption]
- 
-const
-  errorMessages: array [TXmlError, string] = [
-    "no error",
-    "']]>' expected",
-    "name expected",
-    "';' expected",
-    "'?>' expected",
-    "'>' expected",
-    "'=' expected",
-    "'\"' or \"'\" expected",
-    "'-->' expected"
-  ]
-
-proc open*(my: var TXmlParser, input: PStream, filename: string,
-           options: set[TXmlParseOption] = {}) =
-  ## initializes the parser with an input stream. `Filename` is only used
-  ## for nice error messages. The parser's behaviour can be controlled by
-  ## the `options` parameter: If `options` contains ``reportWhitespace``
-  ## a whitespace token is reported as an ``xmlWhitespace`` event.
-  ## If `options` contains ``reportComments`` a comment token is reported as an
-  ## ``xmlComment`` event. 
-  lexbase.open(my, input)
-  my.filename = filename
-  my.state = stateStart
-  my.kind = xmlError
-  my.a = ""
-  my.b = ""
-  my.options = options
-  
-proc close*(my: var TXmlParser) {.inline.} = 
-  ## closes the parser `my` and its associated input stream.
-  lexbase.close(my)
-
-proc charData*(my: TXmlParser): string {.inline.} = 
-  ## returns the character data for the events: ``xmlCharData``, 
-  ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
-  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, 
-                     xmlSpecial})
-  return my.a
-
-proc kind*(my: TXmlParser): TXmlEventKind {.inline.} = 
-  ## returns the current event type for the XML parser
-  return my.kind
-
-proc elementName*(my: TXmlParser): string {.inline.} = 
-  ## returns the element name for the events: ``xmlElementStart``, 
-  ## ``xmlElementEnd``, ``xmlElementOpen``
-  assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
-  return my.a
-
-proc entityName*(my: TXmlParser): string {.inline.} = 
-  ## returns the entity name for the event: ``xmlEntity``
-  assert(my.kind == xmlEntity)
-  return my.a
-  
-proc attrKey*(my: TXmlParser): string {.inline.} = 
-  ## returns the attribute key for the event ``xmlAttribute``
-  assert(my.kind == xmlAttribute)
-  return my.a
-  
-proc attrValue*(my: TXmlParser): string {.inline.} = 
-  ## returns the attribute value for the event ``xmlAttribute``
-  assert(my.kind == xmlAttribute)
-  return my.b
-
-proc PIName*(my: TXmlParser): string {.inline.} = 
-  ## returns the processing instruction name for the event ``xmlPI``
-  assert(my.kind == xmlPI)
-  return my.a
-
-proc PIRest*(my: TXmlParser): string {.inline.} = 
-  ## returns the rest of the processing instruction for the event ``xmlPI``
-  assert(my.kind == xmlPI)
-  return my.b
-
-proc getColumn*(my: TXmlParser): int {.inline.} = 
-  ## get the current column the parser has arrived at.
-  result = getColNumber(my, my.bufPos)
-
-proc getLine*(my: TXmlParser): int {.inline.} = 
-  ## get the current line the parser has arrived at.
-  result = my.linenumber
-
-proc getFilename*(my: TXmlParser): string {.inline.} = 
-  ## get the filename of the file that the parser processes.
-  result = my.filename
-  
-proc errorMsg*(my: TXmlParser): string = 
-  ## returns a helpful error message for the event ``xmlError``
-  assert(my.kind == xmlError)
-  result = "$1($2, $3) Error: $4" % [
-    my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
-
-proc errorMsgExpected*(my: TXmlParser, tag: string): string = 
-  ## returns an error message "<tag> expected" in the same format as the
-  ## other error messages 
-  result = "$1($2, $3) Error: $4" % [
-    my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
-    
-proc markError(my: var TXmlParser, kind: TXmlError) {.inline.} = 
-  my.err = kind
-  my.state = stateError
-
-proc parseCDATA(my: var TXMLParser) = 
-  var pos = my.bufpos + len("<![CDATA[")
-  var buf = my.buf
-  while true:
-    case buf[pos] 
-    of ']':
-      if buf[pos+1] == ']' and buf[pos+2] == '>':
-        inc(pos, 3)
-        break
-      add(my.a, ']')
-      inc(pos)
-    of '\0': 
-      markError(my, errEndOfCDataExpected)
-      break
-    of '\c': 
-      pos = lexbase.HandleCR(my, pos)
-      buf = my.buf
-      add(my.a, '\L')
-    of '\L': 
-      pos = lexbase.HandleLF(my, pos)
-      buf = my.buf
-      add(my.a, '\L')
-    else:
-      add(my.a, buf[pos])
-      inc(pos)    
-  my.bufpos = pos # store back
-  my.kind = xmlCDATA
-
-proc parseComment(my: var TXMLParser) = 
-  var pos = my.bufpos + len("<!--")
-  var buf = my.buf
-  while true:
-    case buf[pos] 
-    of '-':
-      if buf[pos+1] == '-' and buf[pos+2] == '>':
-        inc(pos, 3)
-        break
-      if my.options.contains(reportComments): add(my.a, '-')
-      inc(pos)
-    of '\0': 
-      markError(my, errEndOfCommentExpected)
-      break
-    of '\c': 
-      pos = lexbase.HandleCR(my, pos)
-      buf = my.buf
-      if my.options.contains(reportComments): add(my.a, '\L')
-    of '\L': 
-      pos = lexbase.HandleLF(my, pos)
-      buf = my.buf
-      if my.options.contains(reportComments): add(my.a, '\L')
-    else:
-      if my.options.contains(reportComments): add(my.a, buf[pos])
-      inc(pos)
-  my.bufpos = pos
-  my.kind = xmlComment
-
-proc parseWhitespace(my: var TXmlParser, skip=False) = 
-  var pos = my.bufpos
-  var buf = my.buf
-  while true: 
-    case buf[pos]
-    of ' ', '\t': 
-      if not skip: add(my.a, buf[pos])
-      Inc(pos)
-    of '\c':  
-      # the specification says that CR-LF, CR are to be transformed to LF
-      pos = lexbase.HandleCR(my, pos)
-      buf = my.buf
-      if not skip: add(my.a, '\L')
-    of '\L': 
-      pos = lexbase.HandleLF(my, pos)
-      buf = my.buf
-      if not skip: add(my.a, '\L')
-    else:
-      break
-  my.bufpos = pos
-
-const
-  NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
-  NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
-
-proc parseName(my: var TXmlParser, dest: var string) = 
-  var pos = my.bufpos
-  var buf = my.buf
-  if buf[pos] in nameStartChar: 
-    while true:
-      add(dest, buf[pos])
-      inc(pos)
-      if buf[pos] notin NameChar: break
-    my.bufpos = pos
-  else:
-    markError(my, errNameExpected)
-
-proc parseEntity(my: var TXmlParser, dest: var string) = 
-  var pos = my.bufpos+1
-  var buf = my.buf
-  my.kind = xmlCharData
-  if buf[pos] == '#':
-    var r: int
-    inc(pos)
-    if buf[pos] == 'x': 
-      inc(pos)
-      while true:
-        case buf[pos]
-        of '0'..'9': r = (r shl 4) or (ord(buf[pos]) - ord('0'))
-        of 'a'..'f': r = (r shl 4) or (ord(buf[pos]) - ord('a') + 10)
-        of 'A'..'F': r = (r shl 4) or (ord(buf[pos]) - ord('A') + 10)
-        else: break
-        inc(pos)
-    else:
-      while buf[pos] in {'0'..'9'}: 
-        r = r * 10 + (ord(buf[pos]) - ord('0'))
-        inc(pos)
-    add(dest, toUTF8(TRune(r)))
-  elif buf[pos] == 'l' and buf[pos+1] == 't':
-    add(dest, '<')
-    inc(pos, 2)
-  elif buf[pos] == 'g' and buf[pos+1] == 't':
-    add(dest, '>')
-    inc(pos, 2)
-  elif buf[pos] == 'a' and buf[pos+1] == 'm' and buf[pos+2] == 'p':
-    add(dest, '&')
-    inc(pos, 3)
-  elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and 
-      buf[pos+3] == 's':
-    add(dest, '\'')
-    inc(pos, 4)
-  elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and 
-      buf[pos+3] == 't':
-    add(dest, '"')
-    inc(pos, 4)
-  else:
-    my.bufpos = pos
-    parseName(my, dest)
-    pos = my.bufpos
-    if my.err != errNameExpected: 
-      my.kind = xmlEntity
-    else:
-      add(dest, '&')
-  if buf[pos] == ';': 
-    inc(pos)
-  else:
-    markError(my, errSemiColonExpected)
-  my.bufpos = pos
-
-proc parsePI(my: var TXmlParser) = 
-  inc(my.bufpos, "<?".len)
-  parseName(my, my.a)
-  var pos = my.bufpos
-  var buf = my.buf
-  setLen(my.b, 0)
-  while true: 
-    case buf[pos]
-    of '\0':
-      markError(my, errQmGtExpected)
-      break
-    of '?':
-      if buf[pos+1] == '>':
-        inc(pos, 2)
-        break
-      add(my.b, '?')
-      inc(pos)
-    of '\c':  
-      # the specification says that CR-LF, CR are to be transformed to LF
-      pos = lexbase.HandleCR(my, pos)
-      buf = my.buf      
-      add(my.b, '\L')
-    of '\L': 
-      pos = lexbase.HandleLF(my, pos)
-      buf = my.buf
-      add(my.b, '\L')
-    else:
-      add(my.b, buf[pos])
-      inc(pos)
-  my.bufpos = pos
-  my.kind = xmlPI
-
-proc parseSpecial(my: var TXmlParser) = 
-  # things that start with <!
-  var pos = my.bufpos + 2
-  var buf = my.buf
-  var opentags = 0
-  while true: 
-    case buf[pos]
-    of '\0':
-      markError(my, errGtExpected)
-      break
-    of '<': 
-      inc(opentags)
-      inc(pos)
-      add(my.a, '<')
-    of '>':
-      if opentags <= 0:
-        inc(pos)
-        break
-      dec(opentags)
-      inc(pos)
-      add(my.a, '>')
-    of '\c':  
-      pos = lexbase.HandleCR(my, pos)
-      buf = my.buf
-      add(my.a, '\L')
-    of '\L': 
-      pos = lexbase.HandleLF(my, pos)
-      buf = my.buf
-      add(my.a, '\L')
-    else:
-      add(my.a, buf[pos])
-      inc(pos)
-  my.bufpos = pos
-  my.kind = xmlSpecial
-
-proc parseTag(my: var TXmlParser) = 
-  inc(my.bufpos)
-  parseName(my, my.a)
-  # if we have no name, do not interpret the '<':
-  if my.a.len == 0: 
-    my.kind = xmlCharData
-    add(my.a, '<')
-    return
-  parseWhitespace(my, skip=True)
-  if my.buf[my.bufpos] in NameStartChar: 
-    # an attribute follows:
-    my.kind = xmlElementOpen
-    my.state = stateAttr
-  else:
-    my.kind = xmlElementStart
-    if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
-      inc(my.bufpos, 2)
-      my.state = stateEmptyElementTag
-    elif my.buf[my.bufpos] == '>':
-      inc(my.bufpos)  
-    else:
-      markError(my, errGtExpected)
-  
-proc parseEndTag(my: var TXmlParser) = 
-  inc(my.bufpos, 2)
-  parseName(my, my.a)
-  parseWhitespace(my, skip=True)
-  if my.buf[my.bufpos] == '>':
-    inc(my.bufpos)
-  else:
-    markError(my, errGtExpected)
-  my.kind = xmlElementEnd
-
-proc parseAttribute(my: var TXmlParser) = 
-  my.kind = xmlAttribute
-  setLen(my.a, 0)
-  setLen(my.b, 0)
-  parseName(my, my.a)
-  # if we have no name, we have '<tag attr= key %&$$%':
-  if my.a.len == 0: 
-    markError(my, errGtExpected)
-    return
-  parseWhitespace(my, skip=True)
-  if my.buf[my.bufpos] != '=':
-    markError(my, errEqExpected)
-    return
-  inc(my.bufpos)
-  parseWhitespace(my, skip=True)
-
-  var pos = my.bufpos
-  var buf = my.buf
-  if buf[pos] in {'\'', '"'}:
-    var quote = buf[pos]
-    var pendingSpace = false
-    inc(pos)
-    while true: 
-      case buf[pos]
-      of '\0':
-        markError(my, errQuoteExpected)
-        break
-      of '&': 
-        if pendingSpace: 
-          add(my.b, ' ')
-          pendingSpace = false
-        my.bufpos = pos
-        parseEntity(my, my.b)
-        my.kind = xmlAttribute # parseEntity overwrites my.kind!
-        pos = my.bufpos
-      of ' ', '\t': 
-        pendingSpace = true
-        inc(pos)
-      of '\c':  
-        pos = lexbase.HandleCR(my, pos)
-        buf = my.buf
-        pendingSpace = true
-      of '\L': 
-        pos = lexbase.HandleLF(my, pos)
-        buf = my.buf
-        pendingSpace = true
-      else:
-        if buf[pos] == quote:
-          inc(pos)
-          break
-        else:
-          if pendingSpace: 
-            add(my.b, ' ')
-            pendingSpace = false
-          add(my.b, buf[pos])
-          inc(pos)
-  else:
-    markError(my, errQuoteExpected)  
-  my.bufpos = pos
-  parseWhitespace(my, skip=True)
-  
-proc parseCharData(my: var TXmlParser) = 
-  var pos = my.bufpos
-  var buf = my.buf
-  while true: 
-    case buf[pos]
-    of '\0', '<', '&': break
-    of '\c':  
-      # the specification says that CR-LF, CR are to be transformed to LF
-      pos = lexbase.HandleCR(my, pos)
-      buf = my.buf
-      add(my.a, '\L')
-    of '\L': 
-      pos = lexbase.HandleLF(my, pos)
-      buf = my.buf
-      add(my.a, '\L')
-    else:
-      add(my.a, buf[pos])
-      inc(pos)
-  my.bufpos = pos
-  my.kind = xmlCharData
-
-proc rawGetTok(my: var TXmlParser) = 
-  my.kind = xmlError
-  setLen(my.a, 0)
-  var pos = my.bufpos
-  var buf = my.buf
-  case buf[pos]
-  of '<': 
-    case buf[pos+1]
-    of '/':
-      parseEndTag(my)
-    of '!':
-      if buf[pos+2] == '[' and buf[pos+3] == 'C' and buf[pos+4] == 'D' and
-          buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and
-          buf[pos+8] == '[':
-        parseCDATA(my)
-      elif buf[pos+2] == '-' and buf[pos+3] == '-': 
-        parseComment(my)
-      else: 
-        parseSpecial(my)
-    of '?':
-      parsePI(my)
-    else: 
-      parseTag(my)
-  of ' ', '\t', '\c', '\l': 
-    parseWhiteSpace(my)
-    my.kind = xmlWhitespace
-  of '\0': 
-    my.kind = xmlEof
-  of '&':
-    parseEntity(my, my.a)
-  else: 
-    parseCharData(my)
-  assert my.kind != xmlError
-    
-proc getTok(my: var TXmlParser) = 
-  while true:
-    rawGetTok(my)
-    case my.kind
-    of xmlComment: 
-      if my.options.contains(reportComments): break
-    of xmlWhitespace: 
-      if my.options.contains(reportWhitespace): break
-    else: break
-    
-proc next*(my: var TXmlParser) = 
-  ## retrieves the first/next event. This controls the parser.
-  case my.state
-  of stateNormal:
-    getTok(my)  
-  of stateStart:
-    getTok(my)
-    if my.kind == xmlPI and my.a == "xml": 
-      # just skip the first ``<?xml >`` processing instruction
-      getTok(my)
-    my.state = stateNormal
-  of stateAttr:
-    # parse an attribute key-value pair:
-    if my.buf[my.bufpos] == '>':
-      my.kind = xmlElementClose
-      inc(my.bufpos)
-      my.state = stateNormal
-    elif my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': 
-      my.kind = xmlElementClose
-      inc(my.bufpos, 2)
-      my.state = stateEmptyElementTag
-    else:
-      parseAttribute(my)
-      # state remains the same
-  of stateEmptyElementTag:
-    my.state = stateNormal
-    my.kind = xmlElementEnd
-  of stateError: 
-    my.kind = xmlError
-    my.state = stateNormal
-  
-when isMainModule:
-  import os
-  var s = newFileStream(ParamStr(1), fmRead)
-  if s == nil: quit("cannot open the file" & ParamStr(1))
-  var x: TXmlParser
-  open(x, s, ParamStr(1))
-  while true:
-    next(x)
-    case x.kind
-    of xmlError: Echo(x.errorMsg())
-    of xmlEof: break
-    of xmlCharData: echo(x.charData)
-    of xmlWhitespace: echo("|$1|" % x.charData)
-    of xmlComment: echo("<!-- $1 -->" % x.charData)
-    of xmlPI: echo("<? $1 ## $2 ?>" % [x.PIName, x.PIRest])
-    of xmlElementStart: echo("<$1>" % x.elementName)
-    of xmlElementEnd: echo("</$1>" % x.elementName)
-    
-    of xmlElementOpen: echo("<$1" % x.elementName) 
-    of xmlAttribute:   
-      echo("Key: " & x.attrKey)
-      echo("Value: " & x.attrValue)
-    
-    of xmlElementClose: echo(">") 
-    of xmlCData:
-      echo("<![CDATA[$1]]>" % x.charData)
-    of xmlEntity:
-      echo("&$1;" % x.entityName)
-    of xmlSpecial:
-      echo("SPECIAL: " & x.charData)
-  close(x)
-