summary refs log tree commit diff stats
path: root/lib/pure/htmlparser.nim
blob: df840e15c1929112a21efded8c098d416bbd4728 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2010 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module parses an HTML document and creates its XML tree representation.
## It is supposed to handle the *wild* HTML the real world uses.
## 
## It can be used to parse a wild HTML document and output it as a valid XHTML
## document (if you are lucky):
##
## .. code-block:: nimrod
##
##   echo loadHtml("mydirty.html")
##
##
## Every tag in the resulting tree is in lower case.
##
## **Note:** The resulting ``PXmlNode``s already use the ``clientData`` field, 
## so it cannot be used by clients of this library.

import streams, parsexml, xmltree, strtabs, strutils

type
  THtmlTag* = enum ## list of all supported HTML tags; the members after
                   ## ``tagUnknown`` are always in alphabetical order of the
                   ## element name
    tagUnknown,    ## unknown HTML element
    tagA,          ## the HTML ``a`` element
    tagAcronym,    ## the HTML ``acronym`` element
    tagAddress,    ## the HTML ``address`` element
    tagArea,       ## the HTML ``area`` element
    tagB,          ## the HTML ``b`` element
    tagBase,       ## the HTML ``base`` element
    tagBig,        ## the HTML ``big`` element
    tagBlockquote, ## the HTML ``blockquote`` element
    tagBody,       ## the HTML ``body`` element
    tagBr,         ## the HTML ``br`` element
    tagButton,     ## the HTML ``button`` element
    tagCaption,    ## the HTML ``caption`` element
    tagCite,       ## the HTML ``cite`` element
    tagCode,       ## the HTML ``code`` element
    tagCol,        ## the HTML ``col`` element
    tagColgroup,   ## the HTML ``colgroup`` element
    tagDd,         ## the HTML ``dd`` element
    tagDel,        ## the HTML ``del`` element
    tagDfn,        ## the HTML ``dfn`` element
    tagDiv,        ## the HTML ``div`` element
    tagDl,         ## the HTML ``dl`` element
    tagDt,         ## the HTML ``dt`` element
    tagEm,         ## the HTML ``em`` element
    tagFieldset,   ## the HTML ``fieldset`` element
    tagForm,       ## the HTML ``form`` element
    tagH1,         ## the HTML ``h1`` element
    tagH2,         ## the HTML ``h2`` element
    tagH3,         ## the HTML ``h3`` element
    tagH4,         ## the HTML ``h4`` element
    tagH5,         ## the HTML ``h5`` element
    tagH6,         ## the HTML ``h6`` element
    tagHead,       ## the HTML ``head`` element
    tagHr,         ## the HTML ``hr`` element
    tagHtml,       ## the HTML ``html`` element
    tagI,          ## the HTML ``i`` element
    tagImg,        ## the HTML ``img`` element
    tagInput,      ## the HTML ``input`` element
    tagIns,        ## the HTML ``ins`` element
    tagKbd,        ## the HTML ``kbd`` element
    tagLabel,      ## the HTML ``label`` element
    tagLegend,     ## the HTML ``legend`` element
    tagLi,         ## the HTML ``li`` element
    tagLink,       ## the HTML ``link`` element
    tagMap,        ## the HTML ``map`` element
    tagMeta,       ## the HTML ``meta`` element
    tagNoscript,   ## the HTML ``noscript`` element
    tagObject,     ## the HTML ``object`` element
    tagOl,         ## the HTML ``ol`` element
    tagOptgroup,   ## the HTML ``optgroup`` element
    tagOption,     ## the HTML ``option`` element
    tagP,          ## the HTML ``p`` element
    tagParam,      ## the HTML ``param`` element
    tagPre,        ## the HTML ``pre`` element
    tagQ,          ## the HTML ``q`` element
    tagSamp,       ## the HTML ``samp`` element
    tagScript,     ## the HTML ``script`` element
    tagSelect,     ## the HTML ``select`` element
    tagSmall,      ## the HTML ``small`` element
    tagSpan,       ## the HTML ``span`` element
    tagStrong,     ## the HTML ``strong`` element
    tagStyle,      ## the HTML ``style`` element
    tagSub,        ## the HTML ``sub`` element
    tagSup,        ## the HTML ``sup`` element
    tagTable,      ## the HTML ``table`` element
    tagTbody,      ## the HTML ``tbody`` element
    tagTd,         ## the HTML ``td`` element
    tagTextarea,   ## the HTML ``textarea`` element
    tagTfoot,      ## the HTML ``tfoot`` element
    tagTh,         ## the HTML ``th`` element
    tagThead,      ## the HTML ``thead`` element
    tagTitle,      ## the HTML ``title`` element
    tagTr,         ## the HTML ``tr`` element
    tagTt,         ## the HTML ``tt`` element
    tagUl,         ## the HTML ``ul`` element
    tagVar         ## the HTML ``var`` element

const 
  # Element names, looked up with a binary search by ``htmlTag``. Two
  # invariants must hold:
  #  * the array is sorted ascending according to ``cmp`` (so "hr" comes
  #    before "html"), otherwise the binary search misses entries;
  #  * entry ``i`` names the enum member with ordinal ``i + 1``, because
  #    ordinal 0 is reserved for ``tagUnknown``.
  tagStrs = [
    "a", "acronym", "address", "area", "b", "base", "big", "blockquote", 
    "body", "br", "button", "caption", "cite", "code", "col", "colgroup", 
    "dd", "del", "dfn", "div", "dl", "dt", "em", "fieldset", 
    "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", 
    "i", "img", "input", "ins", "kbd", "label", "legend", "li", "link", 
    "map", "meta", "noscript", "object", "ol", "optgroup", "option", 
    "p", "param", "pre", "q", "samp", "script", "select", "small", 
    "span", "strong", "style", "sub", "sup", "table", "tbody", "td", 
    "textarea", "tfoot", "th", "thead", "title", "tr", "tt", "ul", "var"
  ]

proc binaryStrSearch(x: openarray[string], y: string): int = 
  ## Binary search for `y` in `x`; `x` has to be sorted in ascending order
  ## as defined by ``cmp``. Returns the index of the matching entry or -1 if
  ## there is none.
  ## XXX put this into the library somewhere!
  result = -1
  var lo = 0
  var hi = high(x)
  while lo <= hi:
    var middle = (lo + hi) div 2
    var order = cmp(x[middle], y)
    if order == 0:
      # exact hit: report its position and stop searching
      result = middle
      break
    if order < 0:
      lo = middle + 1   # entry is too small, continue in the upper half
    else:
      hi = middle - 1   # entry is too big, continue in the lower half

proc htmlTag*(n: PXmlNode): THtmlTag = 
  ## Returns the tag of `n` as a ``THtmlTag``. The looked-up value is stored
  ## in the node's ``clientData`` field and reused while that field is
  ## non-zero. Even with this caching the call can be more expensive than
  ## comparing ``tag`` directly to a string.
  var cached = n.clientData
  if cached == 0:
    # Nothing stored yet (0 also stands for ``tagUnknown``): search the name
    # table; the +1 maps "not found" (-1) to 0 and index i to ordinal i+1.
    cached = binaryStrSearch(tagStrs, n.tag) + 1
    n.clientData = cached
  result = THtmlTag(cached)

proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
  ## Consumes parser events from `x` and builds one element, including its
  ## attributes and child nodes, using `doc` as the node factory. Returns the
  ## element once its matching end tag is seen. Raises ``EMismatchedTag`` if
  ## a different end tag is found or the input ends first, and
  ## ``EParserError`` for any event kind not handled below.
  ##
  ## NOTE(review): ``PDocument``, ``PElement``, ``parseText``,
  ## ``EMismatchedTag`` and the ``createElement``/``appendChild`` family are
  ## not declared in the visible part of this file; presumably they belong to
  ## a DOM module -- confirm that it is imported before relying on this proc.

  # Placeholder with an empty tag name; it is replaced by the real element
  # when the first start-tag event is processed.
  var n = doc.createElement("")

  while True:
    case x.kind()
    of xmlEof:
      # Input ended before the end tag: fall through to the raise below.
      break
    of xmlElementStart:
      # A start tag while `n` already has a name opens a child element,
      # which is parsed recursively; otherwise it names `n` itself.
      if n.tagName() != "":
        n.appendChild(parseElement(x, doc))
      else:
        n = doc.createElement(x.elementName)
    of xmlElementOpen:
      # Same as above, for a start tag that is followed by attributes.
      if n.tagName() != "":
        n.appendChild(parseElement(x, doc))
      else:
        if x.elementName.contains(':'):
          #TODO: NamespaceURI
          # Names containing ':' are created through the namespace-aware
          # call; the namespace argument is the literal string "nil".
          n = doc.createElementNS("nil", x.elementName)
        else:  
          n = doc.createElement(x.elementName)
        
    of xmlElementEnd:
      if x.elementName == n.nodeName:
        # n.normalize() # Remove any whitespace etc.
        return n
      else: #The wrong element is ended
        raise newException(EMismatchedTag, "Mismatched tag at line " & 
          $x.getLine() & " column " & $x.getColumn)
      
    of xmlCharData:
      n.appendChild(parseText(x, doc))
    of xmlAttribute:
      if x.attrKey.contains(':'):
        #TODO: NamespaceURI
        n.setAttributeNS("nil", x.attrKey, x.attrValue)
      else:
        n.setAttribute(x.attrKey, x.attrValue)
    of xmlCData:
      n.appendChild(doc.createCDATASection(x.charData()))
    of xmlComment:
      n.appendChild(doc.createComment(x.charData()))
    of xmlPI:
      n.appendChild(doc.createProcessingInstruction(x.PIName(), x.PIRest()))
      
    of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial:
      # Unused 'events'

    else:
      raise newException(EParserError, "Unexpected XML Parser event")
    # Advance to the next event; a `return` or `raise` above skips this.
    x.next()

  # Only reached via the ``xmlEof`` branch: the element was never closed.
  raise newException(EMismatchedTag, 
    "Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn)


proc parse*(x: var TXmlParser, father: PXmlNode) =
  ## NOTE(review): this proc has no implementation in the visible source; it
  ## looks like an unfinished stub. Presumably it is meant to read events
  ## from `x` and attach the resulting nodes to `father` -- confirm the
  ## intent and implement it, or remove the declaration.
  

proc parseHtmlNode(x: var TXmlParser, errors: var seq[string]): PXmlNode

proc parseHtmlChildren(x: var TXmlParser, father: PXmlNode,
                       errors: var seq[string]) =
  ## Adds nodes to `father` until its end tag or the end of input is reached.
  while true:
    case x.kind
    of xmlElementEnd:
      if toLower(x.elementName) == father.tag:
        next(x)
      else:
        # Foreign end tag: report it but do not consume it, so that an
        # enclosing element gets the chance to match it.
        errors.add(errorMsg(x, "</" & father.tag & "> expected"))
      break
    of xmlEof:
      errors.add(errorMsg(x, "</" & father.tag & "> expected"))
      break
    else:
      var child = parseHtmlNode(x, errors)
      if child != nil: father.add(child)

proc parseHtmlNode(x: var TXmlParser, errors: var seq[string]): PXmlNode =
  ## Converts the current parser event (and, for elements, everything up to
  ## the end tag) into a node. Every event except ``xmlEof`` is consumed;
  ## returns nil if the event does not produce a node.
  case x.kind
  of xmlComment:
    result = newComment(x.charData)
    next(x)
  of xmlCharData, xmlWhitespace:
    result = newText(x.charData)
    next(x)
  of xmlCData:
    result = newCData(x.charData)
    next(x)
  of xmlEntity:
    result = newEntity(x.entityName)
    next(x)
  of xmlElementStart:
    # tags are normalized to lower case
    result = newElement(toLower(x.elementName))
    next(x)
    parseHtmlChildren(x, result, errors)
  of xmlElementOpen:
    result = newElement(toLower(x.elementName))
    next(x)
    var attrs = newStringTable(modeCaseSensitive)
    while true:
      case x.kind
      of xmlAttribute:
        attrs[toLower(x.attrKey)] = x.attrValue
        next(x)
      of xmlElementClose:
        next(x)
        break
      of xmlError:
        errors.add(errorMsg(x))
        next(x)
        break
      else:
        # leave the unexpected event for the child loop to deal with
        errors.add(errorMsg(x, "'>' expected"))
        break
    result.attr = attrs
    parseHtmlChildren(x, result, errors)
  of xmlElementEnd:
    # only reachable for an end tag that no open element claimed
    errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName))
    next(x)
  of xmlError:
    errors.add(errorMsg(x))
    next(x)
  of xmlPI, xmlSpecial, xmlAttribute, xmlElementClose:
    # processing instructions, the DOCTYPE and stray events are skipped
    next(x)
  of xmlEof: nil

proc parseHtml*(s: PStream, filename: string, 
                errors: var seq[string]): PXmlNode = 
  ## Parses the HTML from stream `s` and returns a ``PXmlNode`` whose tag is
  ## ``html``. `filename` is only used for error messages. Every parsing
  ## error that occurs is added to the `errors` sequence; no exception is
  ## raised for malformed input. Tag and attribute names are converted to
  ## lower case. If the document contains an ``html`` element at the top
  ## level, its attributes and children are moved into the returned root
  ## instead of nesting a second ``html`` element.
  ##
  ## Elements that are never closed (for example ``<br>``) are reported in
  ## `errors` and end where an enclosing element ends.
  var x: TXmlParser
  open(x, s, filename, {reportComments})
  result = newElement("html")
  try:
    next(x)
    while x.kind != xmlEof:
      var node = parseHtmlNode(x, errors)
      if node != nil:
        if node.kind == xnElement and node.tag == "html":
          if node.attr != nil: result.attr = node.attr
          for child in items(node): result.add(child)
        else:
          result.add(node)
  finally:
    # release the parser even if reading the stream fails
    close(x)

proc parseHtml*(s: PStream): PXmlNode = 
  ## Convenience overload: parses the HTML from stream `s` and returns the
  ## resulting ``PXmlNode``. Parsing errors are collected internally and then
  ## thrown away.
  var ignored: seq[string]
  ignored = @[]
  return parseHtml(s, "unknown_html_doc", ignored)

proc loadHtml*(path: string, reportErrors = false): PXmlNode = 
  ## Opens the file `path`, parses its content as HTML and returns the
  ## resulting ``PXmlNode``. Raises ``EIO`` if the file cannot be opened.
  ## With `reportErrors` set to true every parsing error is written out
  ## with ``echo``.
  var input = newFileStream(path, fmRead)
  if input == nil:
    raise newException(EIO, "Unable to read file: " & path)

  var problems: seq[string] = @[]
  result = parseHtml(input, path, problems)
  # nothing more to do unless the caller asked for the error list
  if not reportErrors: return
  for line in items(problems):
    echo(line)