about summary refs log tree commit diff stats
path: root/src/html/parser.nim
diff options
context:
space:
mode:
Diffstat (limited to 'src/html/parser.nim')
-rw-r--r--src/html/parser.nim457
1 files changed, 457 insertions, 0 deletions
diff --git a/src/html/parser.nim b/src/html/parser.nim
new file mode 100644
index 00000000..44b31d4a
--- /dev/null
+++ b/src/html/parser.nim
@@ -0,0 +1,457 @@
+import streams
+import unicode
+import strutils
+import tables
+import json
+
+import types/enums
+import types/tagtypes
+import utils/twtstr
+import utils/radixtree
+import html/dom
+import html/entity
+
+type
+  # Mutable state threaded through the processDocument* procs while one
+  # document is being parsed.
+  HTMLParseState = object
+    # The next six fields are declared but never read or written by the
+    # procs in this module.
+    closed: bool
+    parents: seq[Node]
+    parsedNode: Node
+    a: string
+    b: string
+    attrs: seq[string]
+    # Set when "<!--" is seen and cleared at "-->"; while set, input is
+    # appended to commentNode instead of being parsed as markup.
+    in_comment: bool
+    # Set when a script/style/noscript start tag is processed; while set,
+    # processDocumentTag ignores every tag of a different type.
+    in_script: bool
+    in_style: bool
+    in_noscript: bool
+    # Set by processDocumentBody and on "</head>"; selects whether nodes
+    # added directly under <html> are redirected to body or to head.
+    in_body: bool
+    # Element that newly parsed nodes are inserted into.
+    elementNode: Element
+    # Text node currently receiving character data; nil when none is open.
+    textNode: Text
+    # Comment node currently receiving data; nil outside of a comment.
+    commentNode: Comment
+
+func inputSize*(str: string): int =
+  ## Converts the textual `size` attribute of an input element to an int.
+  ##
+  ## Returns 20 when `str` is empty, contains any non-digit character, or
+  ## holds a digit sequence too large to fit in an `int`.
+  const defaultSize = 20
+  if str.len == 0:
+    return defaultSize
+  for c in str:
+    if not c.isDigit:
+      return defaultSize
+  try:
+    return str.parseInt()
+  except ValueError:
+    # parseInt rejects all-digit input that overflows `int`.
+    return defaultSize
+
+#w3m's getescapecmd and parse_tag, transpiled to nim and heavily modified.
+#(C) Copyright 1994-2002 by Akinori Ito
+#(C) Copyright 2002-2011 by Akinori Ito, Hironori Sakamoto, Fumitoshi Ukai
+#
+#Use, modification and redistribution of this software is hereby granted,
+#provided that this entire copyright notice is included on any copies of
+#this software and applications and derivations thereof.
+#
+#This software is provided on an "as is" basis, without warranty of any
+#kind, either expressed or implied, as to any matter including, but not
+#limited to warranty of fitness of purpose, or merchantability, or
+#results obtained from use of this software.
+proc getescapecmd(buf: string, at: var int): string =
+  ## Decodes the character reference that starts at `buf[at]`, where `at`
+  ## indexes the character right after the '&'.
+  ##
+  ## Returns the decoded text and moves `at` past the reference (including
+  ## an optional trailing ';'). Returns "" when nothing is recognised: for
+  ## a malformed numeric reference `at` is left after the "#" / "#x"
+  ## prefix, otherwise `at` is left unchanged.
+  const maxCodePoint = 0x10FFFF
+  var i = at
+  if i >= buf.len:
+    return ""
+
+  if buf[i] == '#': #num
+    inc i
+    var num: int
+    if i < buf.len and buf[i].tolower() == 'x': #hex
+      inc i
+      # The first digit may be a-f as well as 0-9, so test with hexValue
+      # (compared against -1 exactly as the loop below does).
+      if i >= buf.len or hexValue(buf[i]) == -1:
+        at = i
+        return ""
+
+      num = hexValue(buf[i])
+      inc i
+      while i < buf.len and hexValue(buf[i]) != -1:
+        # Stop accumulating once out of range so `num` cannot overflow;
+        # the remaining digits are still consumed.
+        if num <= maxCodePoint:
+          num *= 0x10
+          num += hexValue(buf[i])
+        inc i
+    else: #dec
+      if i >= buf.len or not isDigit(buf[i]):
+        at = i
+        return ""
+
+      num = decValue(buf[i])
+      inc i
+      while i < buf.len and isDigit(buf[i]):
+        if num <= maxCodePoint:
+          num *= 10
+          num += decValue(buf[i])
+        inc i
+
+    if i < buf.len and buf[i] == ';':
+      inc i
+    at = i
+    if num > maxCodePoint:
+      # Not a Unicode code point: emit U+FFFD REPLACEMENT CHARACTER.
+      return $(Rune(0xFFFD))
+    return $(Rune(num))
+  elif not isAlphaAscii(buf[i]):
+    return ""
+
+  # Named reference: feed characters to entityMap one at a time, following
+  # child nodes for as long as the accumulated key is still a prefix.
+  var n = entityMap
+  var s = ""
+  while i < buf.len:
+    s &= buf[i]
+    if not entityMap.hasPrefix(s, n):
+      break
+    let pn = n
+    n = n{s}
+    if n != pn:
+      # Moved to a different node: start a fresh key relative to it.
+      s = ""
+    inc i
+
+  if n.leaf:
+    at = i
+    return n.value
+
+  return ""
+
+type
+  # Result of parse_tag: one start or end tag with its attributes.
+  DOMParsedTag = object
+    # Tag type obtained by passing the lower-cased tag name to tagType().
+    tagid: TagType
+    # Attribute values keyed by lower-cased attribute name; an attribute
+    # written without "=value" maps to "".
+    attrs: Table[string, string]
+    # true for a start tag, false when the tag began with "</".
+    open: bool
+
+proc parse_tag(buf: string, at: var int): DOMParsedTag =
+  ## Parses the tag that starts at `buf[at]` (which must be the '<').
+  ##
+  ## Returns the tag type, whether it is a start or end tag, and its
+  ## attributes. On return `at` points just past the closing '>' or equals
+  ## `buf.len` when the tag is unterminated.
+  ## Character references inside quoted attribute values are decoded; an
+  ## '&' that does not start a recognised reference is kept literally.
+  var tag = DOMParsedTag()
+  tag.open = true
+
+  #Parse tag name
+  var tagname = ""
+  inc at
+  if at < buf.len and buf[at] == '/':
+    inc at
+    tag.open = false
+    at = skipBlanks(buf, at)
+
+  while at < buf.len and not buf[at].isWhitespace() and not (tag.open and buf[at] == '/') and buf[at] != '>':
+    tagname &= buf[at].tolower()
+    at += buf.runeLenAt(at)
+
+  tag.tagid = tagType(tagname)
+  at = skipBlanks(buf, at)
+
+  #Parse attributes
+  while at < buf.len and buf[at] != '>':
+    let iterStart = at
+    var value = ""
+    var attrname = ""
+    while at < buf.len and buf[at] != '=' and not buf[at].isWhitespace() and buf[at] != '>':
+      attrname &= buf[at].tolower()
+      at += buf.runeLenAt(at)
+
+    at = skipBlanks(buf, at)
+    if at < buf.len and buf[at] == '=':
+      inc at
+      at = skipBlanks(buf, at)
+      if at < buf.len and (buf[at] == '"' or buf[at] == '\''):
+        #Quoted value: runs up to the matching quote character
+        let startc = buf[at]
+        inc at
+        while at < buf.len and buf[at] != startc:
+          if buf[at] == '&':
+            inc at
+            let refStart = at
+            var decoded = ""
+            if at < buf.len:
+              decoded = getescapecmd(buf, at)
+            if decoded.len > 0:
+              value &= decoded
+            else:
+              # Not a character reference: keep the '&' and re-read what
+              # follows it as ordinary text.
+              value &= '&'
+              at = refStart
+          else:
+            var r: Rune
+            fastRuneAt(buf, at, r)
+            value &= $r
+        if at < buf.len:
+          inc at
+      else:
+        #Unquoted value: runs up to whitespace or '>'
+        while at < buf.len and not buf[at].isWhitespace() and buf[at] != '>':
+          # Copy whole runes so multibyte characters stay intact.
+          var r: Rune
+          fastRuneAt(buf, at, r)
+          value &= $r
+
+    if attrname.len > 0:
+      tag.attrs[attrname] = value
+
+    # Guarantee forward progress even if nothing above consumed input
+    # (e.g. a whitespace character that skipBlanks does not skip).
+    if at == iterStart:
+      inc at
+
+  if at < buf.len and buf[at] == '>':
+    inc at
+  return tag
+
+proc insertNode(parent: Node, node: Node) =
+  ## Appends `node` as the last child of `parent`.
+  ##
+  ## Links `node` with the previous last child as siblings, sets its
+  ## parent, parent element and owner document, and, when `node` is an
+  ## element with an owner document, registers it in that document's
+  ## all_elements, type_elements, id_elements and class_elements indexes.
+
+  # Link siblings before appending, while the last child is still the
+  # node that will precede `node`.
+  if parent.childNodes.len > 0:
+    let prevSibling = parent.childNodes[^1]
+    prevSibling.nextSibling = node
+    node.previousSibling = prevSibling
+
+  parent.childNodes.add(node)
+
+  node.parentNode = parent
+  if parent.nodeType == ELEMENT_NODE:
+    node.parentElement = Element(parent)
+
+  if parent.ownerDocument != nil:
+    node.ownerDocument = parent.ownerDocument
+  elif parent.nodeType == DOCUMENT_NODE:
+    node.ownerDocument = Document(parent)
+
+  if node.nodeType == ELEMENT_NODE:
+    let element = Element(node)
+    parent.children.add(element)
+
+    let document = element.ownerDocument
+    if document != nil:
+      document.all_elements.add(element)
+      document.type_elements[element.tagType].add(element)
+      if element.id != "":
+        if element.id notin document.id_elements:
+          document.id_elements[element.id] = newSeq[Element]()
+        document.id_elements[element.id].add(element)
+
+      for c in element.classList:
+        if c notin document.class_elements:
+          document.class_elements[c] = newSeq[Element]()
+        document.class_elements[c].add(element)
+
+proc processDocumentBody(state: var HTMLParseState) =
+  ## Switches the parser into body mode the first time it is called: sets
+  ## `in_body` and, when the current element has an owner document, makes
+  ## that document's body the current element. Later calls do nothing.
+  if state.in_body:
+    return
+  state.in_body = true
+  let owner = state.elementNode.ownerDocument
+  if owner != nil:
+    state.elementNode = owner.body
+
+proc processDocumentAddNode(state: var HTMLParseState, newNode: Node) =
+  ## Inserts `newNode` under the current element. If the current element
+  ## is the <html> element, the current element is first redirected to the
+  ## document's head (before body mode) or body (in body mode).
+  let current = state.elementNode
+  if current.nodeType == ELEMENT_NODE and current.tagType == TAG_HTML:
+    if not state.in_body:
+      state.elementNode = current.ownerDocument.head
+    else:
+      state.elementNode = current.ownerDocument.body
+
+  state.elementNode.insertNode(newNode)
+
+proc processDocumentEndNode(state: var HTMLParseState) =
+  ## Closes the current element by making its parent element current.
+  ##
+  ## Does nothing when there is no current element, when it is a document
+  ## node, or when it has no parent element. The last case keeps
+  ## `state.elementNode` non-nil when the input contains more end tags
+  ## than start tags; the other processDocument* procs dereference it
+  ## without a nil check.
+  if state.elementNode == nil or state.elementNode.nodeType == DOCUMENT_NODE:
+    return
+  let parent = state.elementNode.parentElement
+  if parent != nil:
+    state.elementNode = parent
+
+proc processDocumentText(state: var HTMLParseState) =
+  ## Makes sure `state.textNode` exists before character data is appended.
+  ## With no open text node, a new one is created and inserted; with an
+  ## open text node that already holds data, body mode is entered.
+  if state.textNode == nil:
+    state.textNode = newText()
+    processDocumentAddNode(state, state.textNode)
+  elif state.textNode.data.len > 0:
+    processDocumentBody(state)
+
+proc processDocumentStartElement(state: var HTMLParseState, element: Element, tag: DOMParsedTag) =
+  ## Handles a start tag: copies the parsed attributes onto `element`,
+  ## fills in its id, class list and tag-specific fields, updates the
+  ## parser flags, and inserts it under the current element.
+  ##
+  ## <html>, <head> and <body> are not inserted (parseHtml creates them up
+  ## front); <body> only switches the parser into body mode. Elements in
+  ## VoidTagTypes are closed again immediately.
+  var add = true
+
+  for k, v in tag.attrs:
+    element.attributes[k] = element.newAttr(k, v)
+
+  element.id = element.getAttrValue("id")
+  if element.attributes.hasKey("class"):
+    for w in unicode.split(element.attributes["class"].value, Rune(' ')):
+      # Consecutive, leading or trailing spaces yield empty tokens, which
+      # are not class names.
+      if w.len > 0:
+        element.classList.add(w)
+
+  case element.tagType
+  of TAG_SCRIPT:
+    state.in_script = true
+  of TAG_NOSCRIPT:
+    state.in_noscript = true
+  of TAG_STYLE:
+    state.in_style = true
+  of TAG_SELECT:
+    HTMLSelectElement(element).name = element.getAttrValue("name")
+    HTMLSelectElement(element).value = element.getAttrValue("value")
+  of TAG_INPUT:
+    HTMLInputElement(element).value = element.getAttrValue("value")
+    HTMLInputElement(element).itype = element.getAttrValue("type").inputType()
+    HTMLInputElement(element).size = element.getAttrValue("size").inputSize()
+  of TAG_A:
+    HTMLAnchorElement(element).href = element.getAttrValue("href")
+  of TAG_OPTION:
+    # The option's value lives in its "value" attribute (it has no href).
+    HTMLOptionElement(element).value = element.getAttrValue("value")
+  of TAG_HTML:
+    add = false
+  of TAG_HEAD:
+    add = false
+  of TAG_BODY:
+    add = false
+    processDocumentBody(state)
+  else: discard
+
+  if state.elementNode.nodeType == ELEMENT_NODE:
+    case element.tagType
+    of SelfClosingTagTypes:
+      # A second tag of the same self-closing type implicitly ends the
+      # one that is still open.
+      if state.elementNode.tagType == element.tagType:
+        processDocumentEndNode(state)
+    of TAG_H1:
+      HTMLHeadingElement(element).rank = 1
+    of TAG_H2:
+      HTMLHeadingElement(element).rank = 2
+    of TAG_H3:
+      HTMLHeadingElement(element).rank = 3
+    of TAG_H4:
+      HTMLHeadingElement(element).rank = 4
+    of TAG_H5:
+      HTMLHeadingElement(element).rank = 5
+    of TAG_H6:
+      HTMLHeadingElement(element).rank = 6
+    else: discard
+
+    # An open <p> is ended by any tag type listed in PClosingTagTypes.
+    if state.elementNode.tagType == TAG_P and element.tagType in PClosingTagTypes:
+      processDocumentEndNode(state)
+
+  if add:
+    processDocumentAddNode(state, element)
+    state.elementNode = element
+
+  if element.tagType in VoidTagTypes:
+    processDocumentEndNode(state)
+
+proc processDocumentEndElement(state: var HTMLParseState, tag: DOMParsedTag) =
+  ## Handles an end tag. End tags of void elements and of <body> are
+  ## ignored; </head> only switches the parser into body mode. Otherwise
+  ## the current element is closed, preceded by one extra close when the
+  ## current element is a different, self-closing type.
+  let closing = tag.tagid
+  if closing in VoidTagTypes or closing == TAG_BODY:
+    return
+  if closing == TAG_HEAD:
+    state.in_body = true
+    return
+
+  let current = state.elementNode
+  if current.nodeType == ELEMENT_NODE and
+      current.tagType != closing and
+      current.tagType in SelfClosingTagTypes:
+    processDocumentEndNode(state)
+
+  processDocumentEndNode(state)
+
+proc processDocumentTag(state: var HTMLParseState, tag: DOMParsedTag) =
+  ## Dispatches a parsed tag to the start- or end-element handler.
+  ##
+  ## While inside <script>, <style> or <noscript>, every tag is ignored
+  ## except the matching end tag, which leaves that mode. A start tag of
+  ## the same type (e.g. "<script>" inside a script string literal) does
+  ## not end the mode.
+  if state.in_script:
+    if tag.tagid == TAG_SCRIPT and not tag.open:
+      state.in_script = false
+    else:
+      return
+
+  if state.in_style:
+    if tag.tagid == TAG_STYLE and not tag.open:
+      state.in_style = false
+    else:
+      return
+
+  if state.in_noscript:
+    if tag.tagid == TAG_NOSCRIPT and not tag.open:
+      state.in_noscript = false
+    else:
+      return
+
+  if tag.open:
+    processDocumentStartElement(state, newHtmlElement(tag.tagid), tag)
+  else:
+    processDocumentEndElement(state, tag)
+
+proc processDocumentPart(state: var HTMLParseState, buf: string) =
+  ## Parses one buffered chunk of input and updates the DOM through
+  ## `state`.
+  ##
+  ## Character references are decoded, "<!--" ... "-->" is collected into
+  ## a comment node, other "<!...>" declarations are skipped, tags are
+  ## handed to processDocumentTag, and everything else is appended to the
+  ## current text node (or to the current comment while inside one).
+  ## An '&' that does not start a recognised reference is kept literally.
+  var at = 0
+  let max = buf.len
+  var was_script = false
+
+  while at < max:
+    case buf[at]
+    of '&':
+      inc at
+      let refStart = at
+      var decoded = ""
+      if at < max:
+        decoded = getescapecmd(buf, at)
+      if decoded.len == 0:
+        # Not a character reference: emit the '&' itself and re-read the
+        # following characters as ordinary input.
+        decoded = "&"
+        at = refStart
+      if state.in_comment:
+        state.commentNode.data &= decoded
+      else:
+        processDocumentText(state)
+        state.textNode.data &= decoded
+    of '<':
+      if state.in_comment:
+        state.commentNode.data &= buf[at]
+        inc at
+      else:
+        var p = at
+        inc p
+        if p < max and buf[p] == '!':
+          inc p
+          if p < max and buf[p] == '-':
+            inc p
+            if p < max and buf[p] == '-':
+              #Start of a comment
+              inc p
+              at = p
+              state.in_comment = true
+              let comment = newComment()
+              state.commentNode = comment
+              processDocumentAddNode(state, comment)
+              if state.textNode != nil:
+                state.textNode.rawtext = state.textNode.getRawText()
+                state.textNode = nil
+          else:
+            #TODO for doctype
+            while p < max and buf[p] != '>':
+              inc p
+            at = p + 1
+            continue
+
+        if not state.in_comment:
+          #A tag ends the text node that was being collected
+          if state.textNode != nil:
+            state.textNode.rawtext = state.textNode.getRawText()
+            state.textNode = nil
+          p = at
+          var tag = parse_tag(buf, at)
+          was_script = state.in_script
+
+          processDocumentTag(state, tag)
+#         if (was_script) {
+#             if (state->in_script) {
+#                 ptr = p;
+#                 processDocumentText(&state->parentNode, &state->textNode);
+#                 Strcat_char(((CharacterData *)state->textNode)->data, *ptr++);
+#             } else if (buffer->javascript_enabled) {
+#                 loadJSToBuffer(buffer, childTextContentNode(state->parentNode->lastChild)->ptr, "<inline>", state->document);
+#             }
+#         }
+    elif buf[at] == '-' and state.in_comment:
+      #Possible end of the comment
+      var p = at
+      inc p
+      if p < max and buf[p] == '-':
+        inc p
+        if p < max and buf[p] == '>':
+          inc p
+          at = p
+          state.commentNode = nil
+          state.in_comment = false
+
+      if state.in_comment:
+        state.commentNode.data &= buf[at]
+        inc at
+    else:
+      var r: Rune
+      fastRuneAt(buf, at, r)
+      if state.in_comment:
+        state.commentNode.data &= $r
+      else:
+        processDocumentText(state)
+        state.textNode.data &= $r
+
+proc parseHtml*(inputStream: Stream): Document =
+  ## Reads HTML from `inputStream` line by line, builds a Document from it
+  ## and returns it. The stream is closed before returning, including when
+  ## reading or parsing raises.
+  ##
+  ## Lines are buffered until no tag is left open at the end of a line, so
+  ## a tag that spans several lines is parsed as a whole.
+  ##
+  ## NOTE: `readLine` strips line terminators and the lines are joined
+  ## without a separator, so line breaks never reach the parser.
+  ## NOTE: if the input ends inside an unterminated tag, the text buffered
+  ## since that point is discarded.
+  let document = newDocument()
+  insertNode(document, document.root)
+  insertNode(document.root, document.head)
+  insertNode(document.root, document.body)
+
+  var state = HTMLParseState()
+  state.elementNode = document.root
+
+  # true while the most recent '<' has not yet been followed by a '>'
+  var till_when = false
+
+  var buf = ""
+  try:
+    while not inputStream.atEnd():
+      let lineBuf = inputStream.readLine()
+      buf &= lineBuf
+
+      # '<' and '>' are single ASCII bytes, so a byte scan is sufficient.
+      for c in lineBuf:
+        case c
+        of '<':
+          till_when = true
+        of '>':
+          till_when = false
+        else: discard
+
+      if till_when:
+        continue
+
+      processDocumentPart(state, buf)
+      buf = ""
+  finally:
+    inputStream.close()
+
+  return document