about summary refs log tree commit diff stats
path: root/parser.nim
diff options
context:
space:
mode:
Diffstat (limited to 'parser.nim')
-rw-r--r-- parser.nim 379
1 files changed, 282 insertions, 97 deletions
diff --git a/parser.nim b/parser.nim
index 3873566d..a8951368 100644
--- a/parser.nim
+++ b/parser.nim
@@ -2,6 +2,7 @@ import parsexml
 import htmlelement
 import streams
 import macros
+import unicode
 
 import twtio
 import enums
@@ -9,33 +10,56 @@ import strutils
 
 type
+  # Mutable state shared by the tokenizer (`next`) and the tree builder
+  # (`nparseHtml`).
   ParseState = object
+    # input stream the tokenizer reads from and peeks into
+    stream: Stream
+    # set to true by closeNode and back to false by processHtmlElement;
+    # closeSingleNodes only acts while it is false
     closed: bool
+    # stack of currently open nodes; nparseHtml pushes the document first and
+    # the last entry is the parent of whatever is parsed next
     parents: seq[HtmlNode]
+    # NOTE(review): not read or written by any code visible in this diff
     parsedNode: HtmlNode
+    # text payload of the most recent event: `next` fills it and nparseHtml
+    # reads it as the tag name or text content
+    a: string
+    # NOTE(review): not used by any code visible in this diff; presumably
+    # reserved for the attribute parsing that is still marked TODO
+    attrs: seq[string]
+
+  # Event kinds returned by `next`. In the code visible here `next` never
+  # produces EVENT_ATTRIBUTE; attribute handling in nparseHtml is still TODO.
+  ParseEvent =
+    enum
+    NO_EVENT, EVENT_COMMENT, EVENT_STARTELEM, EVENT_ENDELEM, EVENT_OPENELEM,
+    EVENT_CLOSEELEM, EVENT_ATTRIBUTE, EVENT_TEXT
 
 #> no I won't manually write all this down
-#> maybe todo to accept stuff other than tagtype (idk how useful that'd be)
-#still todo, it'd be very useful
-macro genEnumCase(s: string): untyped =
-  let casestmt = nnkCaseStmt.newTree() 
-  casestmt.add(ident("s"))
-  for i in low(TagType) .. high(TagType):
+# This is admittedly ugly: the string-to-enum case statement is generated
+# from the enum's members instead of being written out by hand.
+
+macro genEnumCase(s: string, t: typedesc) =
+  result = quote do:
+    let casestmt = nnkCaseStmt.newTree() 
+    casestmt.add(ident(`s`))
+    var first = true
+    for e in low(`t`) .. high(`t`):
+      if first:
+        first = false
+        continue
+      let ret = nnkReturnStmt.newTree()
+      ret.add(newLit(e))
+      let branch = nnkOfBranch.newTree()
+      let enumname = $e
+      let tagname = enumname.split('_')[1..^1].join("_").tolower()
+      branch.add(newLit(tagname))
+      branch.add(ret)
+      casestmt.add(branch)
     let ret = nnkReturnStmt.newTree()
-    ret.add(newLit(TagType(i)))
-    let branch = nnkOfBranch.newTree()
-    let enumname = $TagType(i)
-    let tagname = enumname.substr("TAG_".len, enumname.len - 1).tolower()
-    branch.add(newLit(tagname))
+    ret.add(newLit(low(`t`)))
+    let branch = nnkElse.newTree()
     branch.add(ret)
     casestmt.add(branch)
-  let ret = nnkReturnStmt.newTree()
-  ret.add(newLit(TAG_UNKNOWN))
-  let branch = nnkElse.newTree()
-  branch.add(ret)
-  casestmt.add(branch)
+
+macro genTagTypeCase() =
+  ## Invokes genEnumCase with the identifier name "s" and the TagType enum.
+  ## Used as the entire body of `tagType`, whose string parameter is named `s`.
+  genEnumCase("s", TagType)
+
+macro genInputTypeCase() =
+  ## Invokes genEnumCase with the identifier name "s" and the InputType enum.
+  ## Used as the entire body of `inputType`, whose string parameter is named
+  ## `s`.
+  genEnumCase("s", InputType)
 
 func tagType(s: string): TagType =
-  genEnumCase(s)
+  genTagTypeCase
+
+func inputType(s: string): InputType =
+  genInputTypeCase
 
 func newHtmlElement(tagType: TagType, parentNode: HtmlNode): HtmlElement =
   case tagType
@@ -88,6 +112,8 @@ func newHtmlElement(tagType: TagType, parentNode: HtmlNode): HtmlElement =
     result.marginbottom = 1
   of TAG_A:
     result.islink = true
+  of TAG_INPUT:
+    HtmlInputElement(result).size = 20
   else: discard
 
   if parentNode.isElemNode():
@@ -99,32 +125,6 @@ func newHtmlElement(tagType: TagType, parentNode: HtmlNode): HtmlElement =
     result.hidden = result.hidden or parent.hidden
     result.islink = result.islink or parent.islink
 
-func toInputType*(str: string): InputType =
-  case str
-  of "button": INPUT_BUTTON
-  of "checkbox": INPUT_CHECKBOX
-  of "color": INPUT_COLOR
-  of "date": INPUT_DATE
-  of "datetime_local": INPUT_DATETIME_LOCAL
-  of "email": INPUT_EMAIL
-  of "file": INPUT_FILE
-  of "hidden": INPUT_HIDDEN
-  of "image": INPUT_IMAGE
-  of "month": INPUT_MONTH
-  of "number": INPUT_NUMBER
-  of "password": INPUT_PASSWORD
-  of "radio": INPUT_RADIO
-  of "range": INPUT_RANGE
-  of "reset": INPUT_RESET
-  of "search": INPUT_SEARCH
-  of "submit": INPUT_SUBMIT
-  of "tel": INPUT_TEL
-  of "text": INPUT_TEXT
-  of "time": INPUT_TIME
-  of "url": INPUT_URL
-  of "week": INPUT_WEEK
-  else: INPUT_UNKNOWN
-
 func toInputSize*(str: string): int =
   if str.len == 0:
     return 20
@@ -153,7 +153,7 @@ proc applyAttribute(htmlElement: HtmlElement, key: string, value: string) =
     else: discard
   of "type":
     case htmlElement.tagType
-    of TAG_INPUT: HtmlInputElement(htmlElement).itype = value.toInputType()
+    of TAG_INPUT: HtmlInputElement(htmlElement).itype = value.inputType()
     else: discard
   of "size":
     case htmlElement.tagType
@@ -162,6 +162,10 @@ proc applyAttribute(htmlElement: HtmlElement, key: string, value: string) =
   else: return
 
 proc closeNode(state: var ParseState) =
+  ## Closes the innermost open node: removes the last entry of
+  ## `state.parents` and sets `state.closed`.
+  ##
+  ## Indexes `state.parents[^1]`, so the stack must not be empty when this is
+  ## called.
+  let node = state.parents[^1]
+  # For an element with DISPLAY_BLOCK that has children, flag its first child
+  # with `openblock` and its last child with `closeblock` (the same node gets
+  # both flags when there is only one child).
+  # NOTE(review): presumably consumed by the renderer to place block
+  # boundaries; the flags are not read anywhere in this diff.
+  if node.childNodes.len > 0 and node.isElemNode() and HtmlElement(node).display == DISPLAY_BLOCK:
+    node.childNodes[0].openblock = true
+    node.childNodes[^1].closeblock = true
   state.parents.setLen(state.parents.len - 1)
   state.closed = true
 
@@ -169,76 +173,257 @@ proc closeSingleNodes(state: var ParseState) =
   if not state.closed and state.parents[^1].isElemNode() and HtmlElement(state.parents[^1]).tagType in SingleTagTypes:
     state.closeNode()
 
+proc applyNodeText(htmlNode: HtmlNode) =
+  ## Stores the results of `getRawText` and `getFmtText` in the node's
+  ## `rawtext` and `fmttext` fields, in that order.
+  htmlNode.rawtext = getRawText(htmlNode)
+  htmlNode.fmttext = getFmtText(htmlNode)
+
+proc setParent(state: var ParseState, htmlNode: HtmlNode) =
+  ## Attaches `htmlNode` as the new last child of the innermost open node
+  ## (`state.parents[^1]`) and wires up its parent and sibling links.
+  let parent = state.parents[^1]
+  htmlNode.parentNode = parent
+  # parentElement is only set when the parent is an element node
+  if parent.isElemNode():
+    htmlNode.parentElement = HtmlElement(parent)
+  # link with the current last child, if any, before appending
+  if parent.childNodes.len > 0:
+    let prev = parent.childNodes[^1]
+    htmlNode.previousSibling = prev
+    prev.nextSibling = htmlNode
+  parent.childNodes.add(htmlNode)
+
 proc processHtmlElement(state: var ParseState, htmlElement: HtmlElement) =
   state.closed = false
-  if state.parents[^1].childNodes.len > 0:
-    htmlElement.previousSibling = state.parents[^1].childNodes[^1]
-    htmlElement.previousSibling.nextSibling = htmlElement
-  state.parents[^1].childNodes.add(htmlElement)
+  state.setParent(htmlElement)
   state.parents.add(htmlElement)
 
-proc applyNodeText(htmlNode: HtmlNode) =
-  htmlNode.rawtext = htmlNode.getRawText()
-  htmlNode.fmttext = htmlNode.getFmtText()
+proc parsecomment(state: var ParseState) =
+  ## Reads a comment body from `state.stream` into `state.a`, consuming input
+  ## up to and including the terminating "-->", or up to the end of the stream
+  ## when the comment is unterminated. `state.a` is cleared first.
+  ##
+  ## Runs of '-' are held back until it is known whether they are part of the
+  ## terminator. On "-->" the last two dashes are dropped and any earlier ones
+  ## are kept as text (so "--->" ends the comment with a trailing "-"). Any
+  ## other byte, ASCII or not, releases the held dashes as ordinary text.
+  ## Dashes still held back when the stream ends are discarded.
+  var s = "" # bytes of a not yet complete multi-byte UTF-8 sequence
+  state.a = ""
+  var dashes = 0 # consecutive '-' read but not yet appended to state.a
+  while not state.stream.atEnd():
+    let c = cast[char](state.stream.readInt8())
+    if c == '-':
+      inc dashes
+    elif c == '>' and dashes >= 2:
+      # the final two dashes belong to "-->"; the rest is comment text
+      for _ in 1 .. dashes - 2:
+        state.a &= '-'
+      return
+    else:
+      # anything else proves the held-back dashes were ordinary text
+      for _ in 1 .. dashes:
+        state.a &= '-'
+      dashes = 0
+      if c > char(127):
+        s &= c
+        # append only once the collected bytes form valid UTF-8
+        if s.validateUtf8() == -1:
+          state.a &= s
+          s = ""
+      else:
+        state.a &= c
+
+proc parsecdata(state: var ParseState) =
+  ## Reads the content of a CDATA section from `state.stream` into `state.a`,
+  ## consuming input up to and including the terminating "]]>", or up to the
+  ## end of the stream when the section is unterminated. `state.a` is cleared
+  ## first, so the result holds only the section content (no "[CDATA["
+  ## prefix and no "]]" from the terminator).
+  ##
+  ## Runs of ']' are held back until it is known whether they are part of the
+  ## terminator. On "]]>" the last two brackets are dropped and earlier ones
+  ## are kept as text (so "]]]>" ends the section with a trailing "]"). Any
+  ## other byte, ASCII or not, releases the held brackets as ordinary text.
+  var s = "" # bytes of a not yet complete multi-byte UTF-8 sequence
+  state.a = ""
+  var brackets = 0 # consecutive ']' read but not yet appended to state.a
+  while not state.stream.atEnd():
+    let c = cast[char](state.stream.readInt8())
+    if c == ']':
+      inc brackets
+    elif c == '>' and brackets >= 2:
+      # the final two brackets belong to "]]>"; the rest is content
+      for _ in 1 .. brackets - 2:
+        state.a &= ']'
+      return
+    else:
+      # anything else proves the held-back brackets were ordinary content
+      for _ in 1 .. brackets:
+        state.a &= ']'
+      brackets = 0
+      if c > char(127):
+        s &= c
+        # append only once the collected bytes form valid UTF-8
+        if s.validateUtf8() == -1:
+          state.a &= s
+          s = ""
+      else:
+        state.a &= c
+
+proc next(state: var ParseState): ParseEvent =
+  ## Reads the next token from `state.stream` and returns its kind; the
+  ## token's text is left in `state.a`.
+  ##
+  ## * NO_EVENT: the stream was already at its end, or a "<!...>"
+  ##   declaration that is neither a comment nor CDATA was skipped.
+  ## * EVENT_COMMENT: a "<!--...-->" comment or "<![CDATA[...]]>" section;
+  ##   `state.a` holds its content.
+  ## * EVENT_STARTELEM: "<name>" (optionally with whitespace before '>');
+  ##   the '>' has been consumed and `state.a` holds the name.
+  ## * EVENT_OPENELEM: "<name" followed by whitespace and something other
+  ##   than '>'; the stream is left at that byte and `state.a` holds the name.
+  ## * EVENT_ENDELEM: "</name>"; `state.a` holds the name without the '/'.
+  ## * EVENT_CLOSEELEM: a single '>'.
+  ## * EVENT_TEXT: everything up to (not including) the next '<' or '>'.
+  ##
+  ## Non-ASCII bytes are collected in a side buffer and only appended to
+  ## `state.a` once they form valid UTF-8.
+  result = NO_EVENT
+  if state.stream.atEnd(): return result
+
+  var c = cast[char](state.stream.readInt8())
+  var s = "" # bytes of a not yet complete multi-byte UTF-8 sequence
+  state.a = ""
+  if c < char(128): #ascii
+    case c
+    of '<':
+      if state.stream.atEnd():
+        state.a = $c
+        return EVENT_TEXT
+      # cast rather than convert: peekInt8 is negative for bytes >= 0x80 and
+      # a char() conversion of a negative value is a range error
+      let d = cast[char](state.stream.peekInt8())
+      case d
+      of '/':
+        # consume the '/' so it does not become part of the tag name
+        discard state.stream.readInt8()
+        result = EVENT_ENDELEM
+      of '!':
+        # consume the '!' so the following bytes can be inspected
+        discard state.stream.readInt8()
+        if state.stream.peekStr(2) == "--":
+          discard state.stream.readStr(2)
+          state.parsecomment()
+          return EVENT_COMMENT
+        if state.stream.peekStr(7) == "[CDATA[":
+          discard state.stream.readStr(7)
+          state.parsecdata()
+          return EVENT_COMMENT
+        # any other declaration (e.g. a doctype) is skipped up to its '>';
+        # '>' is ASCII and can never be part of a multi-byte sequence
+        while not state.stream.atEnd():
+          if cast[char](state.stream.readInt8()) == '>':
+            break
+        return NO_EVENT
+      of Letters:
+        # d is left in the stream; the loop below reads it as the first
+        # character of the tag name
+        result = EVENT_STARTELEM
+      else:
+        # a lone '<' is text; d is left in the stream for the text loop
+        result = EVENT_TEXT
+        state.a = $c
+    of '>':
+      return EVENT_CLOSEELEM
+    else:
+      # c was already consumed, so it has to seed the text
+      result = EVENT_TEXT
+      state.a = $c
+  else:
+    # first byte of a multi-byte sequence: keep it so the sequence can be
+    # completed by the text loop
+    result = EVENT_TEXT
+    s = $c
+
+  case result
+  of EVENT_STARTELEM:
+    var atspace = false
+    while not state.stream.atEnd():
+      c = cast[char](state.stream.peekInt8())
+      if s.len == 0 and c < char(128):
+        case c
+        of Whitespace: atspace = true
+        of '>':
+          discard state.stream.readInt8()
+          break
+        else:
+          if atspace:
+            # something follows the name: leave it for the caller
+            return EVENT_OPENELEM
+          else:
+            state.a &= c
+      else:
+        if atspace:
+          return EVENT_OPENELEM
+        s &= c
+        if s.validateUtf8() == -1:
+          state.a &= s
+          s = ""
+      discard state.stream.readInt8()
+  of EVENT_ENDELEM:
+    while not state.stream.atEnd():
+      c = cast[char](state.stream.readInt8())
+      if s.len == 0 and c < char(128):
+        if c == '>': break
+        elif c in Whitespace: discard
+        else: state.a &= c
+      else:
+        s &= c
+        if s.validateUtf8() == -1:
+          state.a &= s
+          s = ""
+  of EVENT_TEXT:
+    while not state.stream.atEnd():
+      c = cast[char](state.stream.peekInt8())
+      if s.len == 0 and c < char(128):
+        # stop in front of the next markup character without consuming it
+        if c in {'<', '>'}: break
+        state.a &= c
+      else:
+        s &= c
+        if s.validateUtf8() == -1:
+          state.a &= s
+          s = ""
+      discard state.stream.readInt8()
+  else: assert(false)
 
-#TODO honestly parsexml sucks I should just make my own
 proc nparseHtml*(inputStream: Stream): Document =
-  var x: XmlParser
-  let options = @[reportWhitespace, allowUnquotedAttribs, allowEmptyAttribs]
-  x.open(inputStream, "")
-  var state: ParseState
+  var state = ParseState(stream: inputStream)
   let document = newDocument()
   state.parents.add(document)
-  while state.parents.len > 0 and x.kind != xmlEof:
-    x.next()
-    case x.kind
-    of xmlComment: discard #TODO
-    of xmlElementStart:
-      eprint "<" & x.rawdata & ">"
+  while state.parents.len > 0 and not inputStream.atEnd():
+    let event = state.next()
+    case event
+    of EVENT_COMMENT: discard #TODO
+    of EVENT_STARTELEM:
       state.closeSingleNodes()
-      let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1])
+      let parsedNode = newHtmlElement(tagType(state.a), state.parents[^1])
       parsedNode.applyNodeText()
       state.processHtmlElement(parsedNode)
-    of xmlElementEnd:
-      eprint "</" & x.rawdata & ">"
+    of EVENT_ENDELEM:
       state.closeNode()
-    of xmlElementOpen:
-      var s = "<" & x.rawdata
+    of EVENT_OPENELEM:
       state.closeSingleNodes()
-      let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1])
-      x.next()
-      while x.kind != xmlElementClose and x.kind != xmlEof:
-        if x.kind == xmlAttribute:
-          HtmlElement(parsedNode).applyAttribute(x.rawData.tolower(), x.rawData2)
-          s &= " " & x.rawdata & "=\"" & x.rawdata2 & "\""
-        elif x.kind == xmlError:
-          HtmlElement(parsedNode).applyAttribute(x.rawData.tolower(), "")
-        elif x.kind == xmlCharData:
-          if x.rawData.strip() == "/>":
-            break
-        elif x.kind == xmlElementEnd:
-          break
-        elif x.kind == xmlElementOpen:
-          #wtf??? TODO
-          break
-        else:
-          assert(false, "wtf " & $x.kind & " " & x.rawdata) #TODO
-        x.next()
-      s &= ">"
-      eprint s
+      let parsedNode = newHtmlElement(tagType(state.a), state.parents[^1])
+      var next = state.next()
+      while next != EVENT_CLOSEELEM and not inputStream.atEnd():
+        #TODO
+        #if next == EVENT_ATTRIBUTE:
+        #  parsedNode.applyAttribute(state.a.tolower(), state.b)
+        #  s &= " " & x.rawdata & "=\"" & x.rawdata2 & "\""
+        #else:
+        #  assert(false, "wtf " & $x.kind & " " & x.rawdata) #TODO
+        next = state.next()
       parsedNode.applyNodeText()
       state.processHtmlElement(parsedNode)
-    of xmlCharData:
-      eprint x.rawdata
+    of EVENT_TEXT:
+      if unicode.strip(state.a).len == 0:
+        continue
       let textNode = new(HtmlNode)
       textNode.nodeType = NODE_TEXT
-      state.parents[^1].childNodes.add(textNode)
-      textNode.parentNode = state.parents[^1]
-      if state.parents[^1].isElemNode():
-        textNode.parentElement = HtmlElement(state.parents[^1])
-      textNode.rawtext = x.rawData
+      state.setParent(textNode)
+      textNode.rawtext = state.a
       textNode.applyNodeText()
-    of xmlEntity: discard #TODO
-    of xmlEof: break
     else: discard
   return document
+
+# Previous parsexml-based nparseHtml, kept commented out as a fallback until the new parser is trusted.
+#proc nparseHtml*(inputStream: Stream): Document =
+#  var x: XmlParser
+#  let options = {reportWhitespace, allowUnquotedAttribs, allowEmptyAttribs}
+#  x.open(inputStream, "", options)
+#  var state = ParseState(stream: inputStream)
+#  let document = newDocument()
+#  state.parents.add(document)
+#  while state.parents.len > 0 and x.kind != xmlEof:
+#    #let event = state.next()
+#    x.next()
+#    case x.kind
+#    of xmlComment: discard #TODO
+#    of xmlElementStart:
+#      state.closeSingleNodes()
+#      let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1])
+#      parsedNode.applyNodeText()
+#      state.processHtmlElement(parsedNode)
+#    of xmlElementEnd:
+#      state.closeNode()
+#    of xmlElementOpen:
+#      var s = "<" & x.rawdata
+#      state.closeSingleNodes()
+#      let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1])
+#      x.next()
+#      while x.kind != xmlElementClose and x.kind != xmlEof:
+#        if x.kind == xmlAttribute:
+#          parsedNode.applyAttribute(x.rawData.tolower(), x.rawData2)
+#          s &= " " & x.rawdata & "=\"" & x.rawdata2 & "\""
+#        else:
+#          assert(false, "wtf " & $x.kind & " " & x.rawdata) #TODO
+#        x.next()
+#      s &= ">"
+#      parsedNode.applyNodeText()
+#      state.processHtmlElement(parsedNode)
+#    of xmlCharData:
+#      let textNode = new(HtmlNode)
+#      textNode.nodeType = NODE_TEXT
+#
+#      state.setParent(textNode)
+#      textNode.rawtext = x.rawData
+#      textNode.applyNodeText()
+#    of xmlEntity: discard #TODO
+#    of xmlEof: break
+#    else: discard
+#  return document