diff options
Diffstat (limited to 'parser.nim')
-rw-r--r-- | parser.nim | 379 |
1 files changed, 282 insertions, 97 deletions
diff --git a/parser.nim b/parser.nim index 3873566d..a8951368 100644 --- a/parser.nim +++ b/parser.nim @@ -2,6 +2,7 @@ import parsexml import htmlelement import streams import macros +import unicode import twtio import enums @@ -9,33 +10,56 @@ import strutils type ParseState = object + stream: Stream closed: bool parents: seq[HtmlNode] parsedNode: HtmlNode + a: string + attrs: seq[string] + + ParseEvent = + enum + NO_EVENT, EVENT_COMMENT, EVENT_STARTELEM, EVENT_ENDELEM, EVENT_OPENELEM, + EVENT_CLOSEELEM, EVENT_ATTRIBUTE, EVENT_TEXT #> no I won't manually write all this down -#> maybe todo to accept stuff other than tagtype (idk how useful that'd be) -#still todo, it'd be very useful -macro genEnumCase(s: string): untyped = - let casestmt = nnkCaseStmt.newTree() - casestmt.add(ident("s")) - for i in low(TagType) .. high(TagType): +#yes this is incredibly ugly +#...but hey, so long as it works + +macro genEnumCase(s: string, t: typedesc) = + result = quote do: + let casestmt = nnkCaseStmt.newTree() + casestmt.add(ident(`s`)) + var first = true + for e in low(`t`) .. high(`t`): + if first: + first = false + continue + let ret = nnkReturnStmt.newTree() + ret.add(newLit(e)) + let branch = nnkOfBranch.newTree() + let enumname = $e + let tagname = enumname.split('_')[1..^1].join("_").tolower() + branch.add(newLit(tagname)) + branch.add(ret) + casestmt.add(branch) let ret = nnkReturnStmt.newTree() - ret.add(newLit(TagType(i))) - let branch = nnkOfBranch.newTree() - let enumname = $TagType(i) - let tagname = enumname.substr("TAG_".len, enumname.len - 1).tolower() - branch.add(newLit(tagname)) + ret.add(newLit(low(`t`))) + let branch = nnkElse.newTree() branch.add(ret) casestmt.add(branch) - let ret = nnkReturnStmt.newTree() - ret.add(newLit(TAG_UNKNOWN)) - let branch = nnkElse.newTree() - branch.add(ret) - casestmt.add(branch) + +macro genTagTypeCase() = + genEnumCase("s", TagType) + +macro genInputTypeCase() = + genEnumCase("s", InputType) func tagType(s: string): TagType = - genEnumCase(s) + genTagTypeCase + +func inputType(s: string): InputType = + genInputTypeCase func newHtmlElement(tagType: TagType, parentNode: HtmlNode): HtmlElement = case tagType @@ -88,6 +112,8 @@ func newHtmlElement(tagType: TagType, parentNode: HtmlNode): HtmlElement = result.marginbottom = 1 of TAG_A: result.islink = true + of TAG_INPUT: + HtmlInputElement(result).size = 20 else: discard if parentNode.isElemNode(): @@ -99,32 +125,6 @@ func newHtmlElement(tagType: TagType, parentNode: HtmlNode): HtmlElement = result.hidden = result.hidden or parent.hidden result.islink = result.islink or parent.islink -func toInputType*(str: string): InputType = - case str - of "button": INPUT_BUTTON - of "checkbox": INPUT_CHECKBOX - of "color": INPUT_COLOR - of "date": INPUT_DATE - of "datetime_local": INPUT_DATETIME_LOCAL - of "email": INPUT_EMAIL - of "file": INPUT_FILE - of "hidden": INPUT_HIDDEN - of "image": INPUT_IMAGE - of "month": INPUT_MONTH - of "number": INPUT_NUMBER - of "password": INPUT_PASSWORD - of "radio": INPUT_RADIO - of "range": INPUT_RANGE - of "reset": INPUT_RESET - of "search": INPUT_SEARCH - of "submit": INPUT_SUBMIT - of "tel": INPUT_TEL - of "text": INPUT_TEXT - of "time": INPUT_TIME - of "url": INPUT_URL - of "week": INPUT_WEEK - else: INPUT_UNKNOWN - func toInputSize*(str: string): int = if str.len == 0: return 20 @@ -153,7 +153,7 @@ proc applyAttribute(htmlElement: HtmlElement, key: string, value: string) = else: discard of "type": case htmlElement.tagType - of TAG_INPUT: HtmlInputElement(htmlElement).itype = value.toInputType() + of TAG_INPUT: HtmlInputElement(htmlElement).itype = value.inputType() else: discard of "size": case htmlElement.tagType @@ -162,6 +162,10 @@ proc applyAttribute(htmlElement: HtmlElement, key: string, value: string) = else: return proc closeNode(state: var ParseState) = + let node = state.parents[^1] + if node.childNodes.len > 0 and node.isElemNode() and HtmlElement(node).display == DISPLAY_BLOCK: + node.childNodes[0].openblock = true + node.childNodes[^1].closeblock = true state.parents.setLen(state.parents.len - 1) state.closed = true @@ -169,76 +173,257 @@ proc closeSingleNodes(state: var ParseState) = if not state.closed and state.parents[^1].isElemNode() and HtmlElement(state.parents[^1]).tagType in SingleTagTypes: state.closeNode() +proc applyNodeText(htmlNode: HtmlNode) = + htmlNode.rawtext = htmlNode.getRawText() + htmlNode.fmttext = htmlNode.getFmtText() + +proc setParent(state: var ParseState, htmlNode: HtmlNode) = + htmlNode.parentNode = state.parents[^1] + if state.parents[^1].isElemNode(): + htmlNode.parentElement = HtmlElement(state.parents[^1]) + if state.parents[^1].childNodes.len > 0: + htmlNode.previousSibling = state.parents[^1].childNodes[^1] + htmlNode.previousSibling.nextSibling = htmlNode + state.parents[^1].childNodes.add(htmlNode) + proc processHtmlElement(state: var ParseState, htmlElement: HtmlElement) = state.closed = false - if state.parents[^1].childNodes.len > 0: - htmlElement.previousSibling = state.parents[^1].childNodes[^1] - htmlElement.previousSibling.nextSibling = htmlElement - state.parents[^1].childNodes.add(htmlElement) + state.setParent(htmlElement) state.parents.add(htmlElement) -proc applyNodeText(htmlNode: HtmlNode) = - htmlNode.rawtext = htmlNode.getRawText() - htmlNode.fmttext = htmlNode.getFmtText() +proc parsecomment(state: var ParseState) = + var s = "" + state.a = "" + var e = 0 + while not state.stream.atEnd(): + let c = cast[char](state.stream.readInt8()) + if c > char(127): + s &= c + if s.validateUtf8() == -1: + state.a &= s + s = "" + else: + case e + of 0: + if c == '-': inc e + of 1: + if c == '-': inc e + else: + e = 0 + state.a &= '-' & c + of 2: + if c == '>': return + else: + e = 0 + state.a &= "--" & c + else: state.a &= c + +proc parsecdata(state: var ParseState) = + var s = "" + var e = 0 + while not state.stream.atEnd(): + let c = cast[char](state.stream.readInt8()) + if c > char(127): + s &= c + if s.validateUtf8() == -1: + state.a &= s + s = "" + else: + case e + of 0: + if c == ']': inc e + of 1: + if c == ']': inc e + else: e = 0 + of 2: + if c == '>': return + else: e = 0 + else: discard + state.a &= c + +proc next(state: var ParseState): ParseEvent = + result = NO_EVENT + if state.stream.atEnd(): return result + + var c = cast[char](state.stream.readInt8()) + var cdata = false + var s = "" + state.a = "" + if c < char(128): #ascii + case c + of '<': + if state.stream.atEnd(): + state.a = $c + return EVENT_TEXT + let d = char(state.stream.peekInt8()) + case d + of '/': result = EVENT_ENDELEM + of '!': + state.a = state.stream.readStr(2) + case state.a + of "[C": + state.a &= state.stream.readStr(7) + if state.a == "[CDATA[": + state.parsecdata() + return EVENT_COMMENT + result = EVENT_TEXT + of "--": + state.parsecomment() + return EVENT_COMMENT + else: + while not state.stream.atEnd(): + c = cast[char](state.stream.readInt8()) + if s.len == 0 and c == '>': + break + elif c > char(127): + s &= c + if s.validateUtf8() == -1: + s = "" + return NO_EVENT + of Letters: + result = EVENT_STARTELEM + else: + result = EVENT_TEXT + state.a = c & d + of '>': + return EVENT_CLOSEELEM + else: result = EVENT_TEXT + else: result = EVENT_TEXT + + case result + of EVENT_STARTELEM: + var atspace = false + var atattr = false + while not state.stream.atEnd(): + c = cast[char](state.stream.peekInt8()) + if s.len == 0 and c < char(128): + case c + of Whitespace: atspace = true + of '>': + discard state.stream.readInt8() + break + else: + if atspace: + return EVENT_OPENELEM + else: + state.a &= s + else: + if atspace: + return EVENT_OPENELEM + s &= c + if s.validateUtf8() == -1: + state.a &= s + s = "" + discard state.stream.readInt8() + of EVENT_ENDELEM: + while not state.stream.atEnd(): + c = cast[char](state.stream.readInt8()) + if s.len == 0 and c < char(128): + if c == '>': break + elif c in Whitespace: discard + else: state.a &= c + else: + s &= c + if s.validateUtf8() == -1: + state.a &= s + s = "" + of EVENT_TEXT: + while not state.stream.atEnd(): + c = cast[char](state.stream.peekInt8()) + if s.len == 0 and c < char(128): + if c in {'<', '>'}: break + state.a &= c + else: + s &= c + if s.validateUtf8() == -1: + state.a &= s + s = "" + discard state.stream.readInt8() + else: assert(false) -#TODO honestly parsexml sucks I should just make my own proc nparseHtml*(inputStream: Stream): Document = - var x: XmlParser - let options = @[reportWhitespace, allowUnquotedAttribs, allowEmptyAttribs] - x.open(inputStream, "") - var state: ParseState + var state = ParseState(stream: inputStream) let document = newDocument() state.parents.add(document) - while state.parents.len > 0 and x.kind != xmlEof: - x.next() - case x.kind - of xmlComment: discard #TODO - of xmlElementStart: - eprint "<" & x.rawdata & ">" + while state.parents.len > 0 and not inputStream.atEnd(): + let event = state.next() + case event + of EVENT_COMMENT: discard #TODO + of EVENT_STARTELEM: state.closeSingleNodes() - let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1]) + let parsedNode = newHtmlElement(tagType(state.a), state.parents[^1]) parsedNode.applyNodeText() state.processHtmlElement(parsedNode) - of xmlElementEnd: - eprint "</" & x.rawdata & ">" + of EVENT_ENDELEM: state.closeNode() - of xmlElementOpen: - var s = "<" & x.rawdata + of EVENT_OPENELEM: state.closeSingleNodes() - let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1]) - x.next() - while x.kind != xmlElementClose and x.kind != xmlEof: - if x.kind == xmlAttribute: - HtmlElement(parsedNode).applyAttribute(x.rawData.tolower(), x.rawData2) - s &= " " & x.rawdata & "=\"" & x.rawdata2 & "\"" - elif x.kind == xmlError: - HtmlElement(parsedNode).applyAttribute(x.rawData.tolower(), "") - elif x.kind == xmlCharData: - if x.rawData.strip() == "/>": - break - elif x.kind == xmlElementEnd: - break - elif x.kind == xmlElementOpen: - #wtf??? TODO - break - else: - assert(false, "wtf " & $x.kind & " " & x.rawdata) #TODO - x.next() - s &= ">" - eprint s + let parsedNode = newHtmlElement(tagType(state.a), state.parents[^1]) + var next = state.next() + while next != EVENT_CLOSEELEM and not inputStream.atEnd(): + #TODO + #if next == EVENT_ATTRIBUTE: + # parsedNode.applyAttribute(state.a.tolower(), state.b) + # s &= " " & x.rawdata & "=\"" & x.rawdata2 & "\"" + #else: + # assert(false, "wtf " & $x.kind & " " & x.rawdata) #TODO + next = state.next() parsedNode.applyNodeText() state.processHtmlElement(parsedNode) - of xmlCharData: - eprint x.rawdata + of EVENT_TEXT: + if unicode.strip(state.a).len == 0: + continue let textNode = new(HtmlNode) textNode.nodeType = NODE_TEXT - state.parents[^1].childNodes.add(textNode) - textNode.parentNode = state.parents[^1] - if state.parents[^1].isElemNode(): - textNode.parentElement = HtmlElement(state.parents[^1]) - textNode.rawtext = x.rawData + state.setParent(textNode) + textNode.rawtext = state.a textNode.applyNodeText() - of xmlEntity: discard #TODO - of xmlEof: break else: discard return document + +#old nparseHtml because I don't trust myself +#proc nparseHtml*(inputStream: Stream): Document = +# var x: XmlParser +# let options = {reportWhitespace, allowUnquotedAttribs, allowEmptyAttribs} +# x.open(inputStream, "", options) +# var state = ParseState(stream: inputStream) +# let document = newDocument() +# state.parents.add(document) +# while state.parents.len > 0 and x.kind != xmlEof: +# #let event = state.next() +# x.next() +# case x.kind +# of xmlComment: discard #TODO +# of xmlElementStart: +# state.closeSingleNodes() +# let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1]) +# parsedNode.applyNodeText() +# state.processHtmlElement(parsedNode) +# of xmlElementEnd: +# state.closeNode() +# of xmlElementOpen: +# var s = "<" & x.rawdata +# state.closeSingleNodes() +# let parsedNode = newHtmlElement(tagType(x.rawData), state.parents[^1]) +# x.next() +# while x.kind != xmlElementClose and x.kind != xmlEof: +# if x.kind == xmlAttribute: +# parsedNode.applyAttribute(x.rawData.tolower(), x.rawData2) +# s &= " " & x.rawdata & "=\"" & x.rawdata2 & "\"" +# else: +# assert(false, "wtf " & $x.kind & " " & x.rawdata) #TODO +# x.next() +# s &= ">" +# parsedNode.applyNodeText() +# state.processHtmlElement(parsedNode) +# of xmlCharData: +# let textNode = new(HtmlNode) +# textNode.nodeType = NODE_TEXT +# +# state.setParent(textNode) +# textNode.rawtext = x.rawData +# textNode.applyNodeText() +# of xmlEntity: discard #TODO +# of xmlEof: break +# else: discard +# return document |