diff options
author | bptato <nincsnevem662@gmail.com> | 2023-12-27 14:18:25 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-12-27 14:18:25 +0100 |
commit | 3053c28d837b51f0348cb0cc036ac01557431ef2 (patch) | |
tree | 876396c84a5cbb1ccf8ec99332fd0cb8d26a4ea5 | |
parent | e89c696c83735b01e8c76d33730ab0e419d53ff4 (diff) | |
download | chawan-3053c28d837b51f0348cb0cc036ac01557431ef2.tar.gz |
Add string interning support
WIP
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | chame/atoms.nim | 14 | ||||
-rw-r--r-- | chame/htmlparser.nim | 482 | ||||
-rw-r--r-- | chame/htmltokenizer.nim | 203 | ||||
-rw-r--r-- | chame/minidom.nim | 148 | ||||
-rw-r--r-- | chame/minidom_cs.nim | 28 | ||||
-rw-r--r-- | chame/tags.nim | 178 | ||||
-rw-r--r-- | tests/shared/tree_common.nim | 21 | ||||
-rw-r--r-- | tests/test1.nim | 14 | ||||
-rw-r--r-- | tests/tokenizer.nim | 57 | ||||
-rw-r--r-- | tests/tree.nim | 17 | ||||
-rw-r--r-- | tests/tree_charset.nim | 14 |
12 files changed, 694 insertions, 484 deletions
diff --git a/README.md b/README.md index 968a96a6..d1a958c0 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ description of the API. * Includes a minimal DOM implementation. * No mandatory dependencies other than the Nim standard library. * Optional character encoding support (see minidom_enc). +* String interning support for tag names. ## To-do @@ -38,7 +39,6 @@ is planned, even if source code comments say otherwise. Other, non-standard-related tasks (in no particular order): -* Use string interning for tag/attribute local names. * Finish integration of html5lib-tests. * Optimize inefficient parts of the library. diff --git a/chame/atoms.nim b/chame/atoms.nim new file mode 100644 index 00000000..406348b8 --- /dev/null +++ b/chame/atoms.nim @@ -0,0 +1,14 @@ +import tags + +type + AtomFactory*[Atom] = ref object of RootObj + strToAtomImpl*: AtomFactoryStrToAtom[Atom] ## Must never be nil. + tagTypeToAtomImpl*: AtomFactoryTagTypeToAtom[Atom] ## Must never be nil. + + AtomFactoryStrToAtom*[Atom] = + proc(factory: AtomFactory[Atom], s: string): Atom {.nimcall.} + ## Turn a string `s` into an atom. + + AtomFactoryTagTypeToAtom*[Atom] = + proc(factory: AtomFactory[Atom], t: TagType): Atom {.nimcall.} + ## Turn a tagType `t` into an atom. diff --git a/chame/htmlparser.nim b/chame/htmlparser.nim index 5651b4d3..fa00b5d9 100644 --- a/chame/htmlparser.nim +++ b/chame/htmlparser.nim @@ -4,6 +4,7 @@ import std/streams import std/strutils import std/tables +import atoms import htmltokenizer import parseerror import tags @@ -14,51 +15,51 @@ export macros # Heavily inspired by html5ever's TreeSink design. type - DOMBuilder*[Handle] = ref object of RootObj - getDocument*: DOMBuilderGetDocument[Handle] + DOMBuilder*[Handle, Atom] = ref object of RootObj + getDocument*: DOMBuilderGetDocument[Handle, Atom] ## Must never be nil. - finish*: DOMBuilderFinish[Handle] + getAtomFactory*: DOMBuilderGetAtomFactory[Handle, Atom] + ## Must never be nil. + finish*: DOMBuilderFinish[Handle, Atom] ## May be nil. - parseError*: DOMBuilderParseError[Handle] + parseError*: DOMBuilderParseError[Handle, Atom] ## May be nil. - setQuirksMode*: DOMBuilderSetQuirksMode[Handle] + setQuirksMode*: DOMBuilderSetQuirksMode[Handle, Atom] ## May be nil - setEncoding*: DOMBuilderSetEncoding[Handle] + setEncoding*: DOMBuilderSetEncoding[Handle, Atom] ## May be nil. - elementPopped*: DOMBuilderElementPopped[Handle] + elementPopped*: DOMBuilderElementPopped[Handle, Atom] ## May be nil. - getTemplateContent*: DOMBuilderGetTemplateContent[Handle] + getTemplateContent*: DOMBuilderGetTemplateContent[Handle, Atom] ## May be nil. (If nil, templates are treated as regular elements.) - getParentNode*: DOMBuilderGetParentNode[Handle] + getParentNode*: DOMBuilderGetParentNode[Handle, Atom] ## Must never be nil. - getLocalName*: DOMBuilderGetLocalName[Handle] + getLocalName*: DOMBuilderGetLocalName[Handle, Atom] ## Must never be nil. - getTagType*: DOMBuilderGetTagType[Handle] - ## May be nil. (If nil, the parser falls back to getLocalName.) - getNamespace*: DOMBuilderGetNamespace[Handle] + getNamespace*: DOMBuilderGetNamespace[Handle, Atom] ## May be nil. (If nil, the parser always uses the HTML namespace.) - createElement*: DOMBuilderCreateElement[Handle] + createElement*: DOMBuilderCreateElement[Handle, Atom] ## Must never be nil. - createComment*: DOMBuilderCreateComment[Handle] + createComment*: DOMBuilderCreateComment[Handle, Atom] ## Must never be nil. - createDocumentType*: DOMBuilderCreateDocumentType[Handle] + createDocumentType*: DOMBuilderCreateDocumentType[Handle, Atom] ## Must never be nil. - insertBefore*: DOMBuilderInsertBefore[Handle] + insertBefore*: DOMBuilderInsertBefore[Handle, Atom] ## Must never be nil. - insertText*: DOMBuilderInsertText[Handle] + insertText*: DOMBuilderInsertText[Handle, Atom] ## Must never be nil. - remove*: DOMBuilderRemove[Handle] + remove*: DOMBuilderRemove[Handle, Atom] ## Must never be nil. - moveChildren*: DOMBuilderMoveChildren[Handle] + moveChildren*: DOMBuilderMoveChildren[Handle, Atom] ## Must never be nil. - addAttrsIfMissing*: DOMBuilderAddAttrsIfMissing[Handle] + addAttrsIfMissing*: DOMBuilderAddAttrsIfMissing[Handle, Atom] ## May be nil. (If nil, some attributes may not be added to the HTML or ## BODY element if more than one of their respective opening tags exist.) - setScriptAlreadyStarted*: DOMBuilderSetScriptAlreadyStarted[Handle] + setScriptAlreadyStarted*: DOMBuilderSetScriptAlreadyStarted[Handle, Atom] ## May be nil. - associateWithForm*: DOMBuilderAssociateWithForm[Handle] + associateWithForm*: DOMBuilderAssociateWithForm[Handle, Atom] ## May be nil. - isSVGIntegrationPoint*: DOMBuilderIsSVGIntegrationPoint[Handle] + isSVGIntegrationPoint*: DOMBuilderIsSVGIntegrationPoint[Handle, Atom] ## May be nil. (If nil, the parser considers no Handle an SVG integration ## point.) @@ -86,29 +87,34 @@ type ## When set to true, the "in template" insertion mode is pushed to the ## stack of template insertion modes on parser start. - DOMBuilderGetDocument*[Handle] = - proc(builder: DOMBuilder[Handle]): Handle {.nimcall.} + DOMBuilderGetDocument*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom]): Handle {.nimcall.} + ## Get the root document node's handle. + ## This must not return nil, not even in the fragment parsing case. + + DOMBuilderGetAtomFactory*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom]): AtomFactory[Atom] {.nimcall.} ## Get the root document node's handle. ## This must not return nil, not even in the fragment parsing case. - DOMBuilderFinish*[Handle] = - proc(builder: DOMBuilder[Handle]) {.nimcall.} + DOMBuilderFinish*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom]) {.nimcall.} ## Parsing has finished. - DOMBuilderParseError*[Handle] = - proc(builder: DOMBuilder[Handle], message: ParseError) {.nimcall.} + DOMBuilderParseError*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], message: ParseError) {.nimcall.} ## Parse error. `message` is an error code either specified by the ## standard (in this case, message < LAST_SPECIFIED_ERROR) or named ## arbitrarily. (At the time of writing, only tokenizer errors have ## specified error codes.) - DOMBuilderSetQuirksMode*[Handle] = - proc(builder: DOMBuilder[Handle], quirksMode: QuirksMode) {.nimcall.} + DOMBuilderSetQuirksMode*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], quirksMode: QuirksMode) {.nimcall.} ## Set quirks mode to either QUIRKS or LIMITED_QUIRKS. NO_QUIRKS ## is the default and is therefore never used here. - DOMBuilderSetEncoding*[Handle] = - proc(builder: DOMBuilder[Handle], encoding: string): SetEncodingResult + DOMBuilderSetEncoding*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], encoding: string): SetEncodingResult {.nimcall.} ## Called whenever a <meta charset=... or a <meta http-equiv=... tag ## containing a non-empty character set is encountered. A @@ -119,38 +125,38 @@ type ## Note that Chame no longer contains any encoding-related logic; this is ## left to the caller. - DOMBuilderElementPopped*[Handle] = - proc(builder: DOMBuilder[Handle], element: Handle) {.nimcall.} + DOMBuilderElementPopped*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], element: Handle) {.nimcall.} ## Called when an element is popped from the stack of open elements ## (i.e. when it has been closed.) - DOMBuilderGetTemplateContent*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): Handle {.nimcall.} + DOMBuilderGetTemplateContent*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], handle: Handle): Handle {.nimcall.} ## Retrieve a handle to the template element's contents. ## Note: this function must never return nil. - DOMBuilderGetParentNode*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): Option[Handle] + DOMBuilderGetParentNode*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], handle: Handle): Option[Handle] {.nimcall.} ## Retrieve a handle to the parent node. ## May return none(Handle) if no parent node exists. - DOMBuilderGetTagType*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): TagType {.nimcall.} + DOMBuilderGetTagType*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], handle: Handle): TagType {.nimcall.} ## Retrieve the tag type of element. - DOMBuilderGetLocalName*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): string {.nimcall.} + DOMBuilderGetLocalName*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], handle: Handle): Atom {.nimcall.} ## Retrieve the local name of element. (This is tagName(getTagType), ## unless the tag is unknown. - DOMBuilderGetNamespace*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): Namespace {.nimcall.} + DOMBuilderGetNamespace*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], handle: Handle): Namespace {.nimcall.} ## Retrieve the namespace of element. - DOMBuilderCreateElement*[Handle] = - proc(builder: DOMBuilder[Handle], localName: string, namespace: Namespace, - tagType: TagType, attrs: Table[string, string]): Handle {.nimcall.} + DOMBuilderCreateElement*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], localName: Atom, namespace: Namespace, + attrs: Table[string, string]): Handle {.nimcall.} ## Create a new element node. ## ## localName is the tag name of the token. @@ -162,25 +168,25 @@ type ## ## attrs is a table of the token's attributes. - DOMBuilderCreateComment*[Handle] = - proc(builder: DOMBuilder[Handle], text: string): Handle {.nimcall.} + DOMBuilderCreateComment*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], text: string): Handle {.nimcall.} ## Create a new comment node. - DOMBuilderInsertText*[Handle] = - proc(builder: DOMBuilder[Handle], parent: Handle, text: string, + DOMBuilderInsertText*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], parent: Handle, text: string, before: Option[Handle]) {.nimcall.} ## Insert a text node at the specified location with contents ## `text`. If the specified location has a previous sibling that is ## a text node, no new text node should be created, but instead `text` ## should be appended to the previous sibling's character data. - DOMBuilderCreateDocumentType*[Handle] = - proc(builder: DOMBuilder[Handle], name, publicId, systemId: string): Handle + DOMBuilderCreateDocumentType*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], name, publicId, systemId: string): Handle {.nimcall.} ## Create a new document type node. - DOMBuilderInsertBefore*[Handle] = - proc(builder: DOMBuilder[Handle], parent, child: Handle, + DOMBuilderInsertBefore*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], parent, child: Handle, before: Option[Handle]) {.nimcall.} ## Insert node `child` before the node called `before`. ## @@ -193,18 +199,18 @@ type ## ## Note: parent may either be an Element or a Document node. - DOMBuilderRemove*[Handle] = - proc(builder: DOMBuilder[Handle], child: Handle) {.nimcall.} + DOMBuilderRemove*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], child: Handle) {.nimcall.} ## Remove `child` from its parent node, and do nothing if `child` ## has no parent node. - DOMBuilderMoveChildren*[Handle] = - proc(builder: DOMBuilder[Handle], fromHandle, toHandle: Handle) {.nimcall.} + DOMBuilderMoveChildren*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], fromHandle, toHandle: Handle) {.nimcall.} ## Remove all children from the node `fromHandle`, then append them to ## `toHandle`. - DOMBuilderAddAttrsIfMissing*[Handle] = - proc(builder: DOMBuilder[Handle], element: Handle, + DOMBuilderAddAttrsIfMissing*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], element: Handle, attrs: Table[string, string]) {.nimcall.} ## Add the attributes in `attrs` to the element node `element`. ## At the time of writing, called for HTML and BODY only. (This may @@ -216,15 +222,15 @@ type ## element.attrs[k] = v ## ``` - DOMBuilderSetScriptAlreadyStarted*[Handle] = - proc(builder: DOMBuilder[Handle], script: Handle) {.nimcall.} + DOMBuilderSetScriptAlreadyStarted*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], script: Handle) {.nimcall.} ## Set the "already started" flag for the script element. ## ## Note: this flag is not togglable, so this callback should just set it ## to true. - DOMBuilderAssociateWithForm*[Handle] = - proc(builder: DOMBuilder[Handle], element, form, intendedParent: Handle) + DOMBuilderAssociateWithForm*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], element, form, intendedParent: Handle) {.nimcall.} ## Called after createElement. Attempts to set form for form-associated ## elements. @@ -232,14 +238,15 @@ type ## Note: the DOM builder is responsible for checking whether the ## intended parent and the form element are in the same tree. - DOMBuilderIsSVGIntegrationPoint*[Handle] = - proc(builder: DOMBuilder[Handle], element: Handle): bool {.nimcall.} + DOMBuilderIsSVGIntegrationPoint*[Handle, Atom] = + proc(builder: DOMBuilder[Handle, Atom], element: Handle): bool {.nimcall.} ## Check if element is an SVG integration point. type - HTML5Parser[Handle] = object + HTML5Parser[Handle, Atom] = object quirksMode: QuirksMode - dombuilder: DOMBuilder[Handle] + dombuilder: DOMBuilder[Handle, Atom] + factory: AtomFactory[Atom] opts: HTML5ParserOpts[Handle] stopped: bool openElements: seq[Handle] @@ -247,15 +254,16 @@ type oldInsertionMode: InsertionMode templateModes: seq[InsertionMode] head: Option[Handle] - tokenizer: Tokenizer + tokenizer: Tokenizer[Atom] form: Option[Handle] fosterParenting: bool # Handle is an element. nil => marker - activeFormatting: seq[(Option[Handle], Token)] + activeFormatting: seq[(Option[Handle], Token[Atom])] framesetok: bool ignoreLF: bool pendingTableChars: string pendingTableCharsWhitespace: bool + caseTable: Table[Atom, Atom] AdjustedInsertionLocation[Handle] = tuple[ inside: Handle, @@ -270,8 +278,17 @@ type AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY, AFTER_AFTER_FRAMESET +# AtomFactory interface functions +proc strToAtom[Handle, Atom](parser: HTML5Parser[Handle, Atom], s: string): + Atom = + return parser.factory.strToAtomImpl(parser.factory, s) + +proc tagTypeToAtom[Handle, Atom](parser: HTML5Parser[Handle, Atom], + tagType: TagType): Atom = + return parser.factory.tagTypeToAtomImpl(parser.factory, tagType) + # DOMBuilder interface functions -proc finish[Handle](parser: HTML5Parser[Handle]) = +proc finish[Handle, Atom](parser: HTML5Parser[Handle, Atom]) = if parser.dombuilder.finish != nil: parser.dombuilder.finish(parser.dombuilder) @@ -279,7 +296,7 @@ proc parseError(parser: HTML5Parser, e: ParseError) = if parser.dombuilder.parseError != nil: parser.dombuilder.parseError(parser.dombuilder, e) -proc setQuirksMode[Handle](parser: var HTML5Parser[Handle], mode: QuirksMode) = +proc setQuirksMode[Handle, Atom](parser: var HTML5Parser[Handle, Atom], mode: QuirksMode) = parser.quirksMode = mode if parser.dombuilder.setQuirksMode != nil: parser.dombuilder.setQuirksMode(parser.dombuilder, mode) @@ -290,92 +307,92 @@ proc setEncoding(parser: var HTML5Parser, cs: string): SetEncodingResult = return dombuilder.setEncoding(dombuilder, cs) return SET_ENCODING_CONTINUE -func getDocument[Handle](parser: HTML5Parser[Handle]): Handle = +func getDocument[Handle, Atom](parser: HTML5Parser[Handle, Atom]): Handle = let dombuilder = parser.dombuilder return dombuilder.getDocument(dombuilder) -func getTemplateContent[Handle](parser: HTML5Parser[Handle], +func getTemplateContent[Handle, Atom](parser: HTML5Parser[Handle, Atom], handle: Handle): Handle = let dombuilder = parser.dombuilder return dombuilder.getTemplateContent(dombuilder, handle) -func getParentNode[Handle](parser: HTML5Parser[Handle], +func getParentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom], handle: Handle): Option[Handle] = let dombuilder = parser.dombuilder return dombuilder.getParentNode(dombuilder, handle) -func getLocalName[Handle](parser: HTML5Parser[Handle], handle: Handle): - string = +func getLocalName[Handle, Atom](parser: HTML5Parser[Handle, Atom], + handle: Handle): Atom = return parser.dombuilder.getLocalName(parser.dombuilder, handle) -func getTagType[Handle](parser: HTML5Parser[Handle], handle: Handle): TagType = - if parser.dombuilder.getTagType != nil: - return parser.dombuilder.getTagType(parser.dombuilder, handle) - return tagType(parser.getLocalName(handle)) +func getTagType[Handle, Atom](parser: HTML5Parser[Handle, Atom], + handle: Handle): TagType = + return parser.getLocalName(handle).toTagType() -func getNamespace[Handle](parser: HTML5Parser[Handle], handle: Handle): - Namespace = +func getNamespace[Handle, Atom](parser: HTML5Parser[Handle, Atom], + handle: Handle): Namespace = if parser.dombuilder.getNamespace != nil: return parser.dombuilder.getNamespace(parser.dombuilder, handle) return Namespace.HTML -func createElement[Handle](parser: HTML5Parser[Handle], localName: string, - namespace: Namespace, tagType: TagType, attrs: Table[string, string]): +func createElement[Handle, Atom](parser: HTML5Parser[Handle, Atom], + localName: Atom, namespace: Namespace, attrs: Table[string, string]): Handle = return parser.dombuilder.createElement(parser.dombuilder, localName, - namespace, tagType, attrs) + namespace, attrs) -func createElement[Handle](parser: HTML5Parser[Handle], tagType: TagType, - namespace: Namespace): Handle = - return parser.createElement(tagName(tagType), namespace, tagType, - Table[string, string]()) +func createElement[Handle, Atom](parser: HTML5Parser[Handle, Atom], + tagType: TagType, namespace: Namespace): Handle = + let atom = parser.tagTypeToAtom(tagType) + return parser.createElement(atom, namespace, Table[string, string]()) -func createComment[Handle](parser: HTML5Parser[Handle], text: string): Handle = +func createComment[Handle, Atom](parser: HTML5Parser[Handle, Atom], text: string): Handle = let dombuilder = parser.dombuilder return dombuilder.createComment(dombuilder, text) -proc createDocumentType[Handle](parser: HTML5Parser[Handle], name, publicId, +proc createDocumentType[Handle, Atom](parser: HTML5Parser[Handle, Atom], name, publicId, systemId: string): Handle = let dombuilder = parser.dombuilder return dombuilder.createDocumentType(dombuilder, name, publicId, systemId) -proc insertBefore[Handle](parser: HTML5Parser[Handle], parent, node: Handle, +proc insertBefore[Handle, Atom](parser: HTML5Parser[Handle, Atom], parent, node: Handle, before: Option[Handle]) = let dombuilder = parser.dombuilder dombuilder.insertBefore(dombuilder, parent, node, before) -proc insertText[Handle](parser: HTML5Parser[Handle], parent: Handle, +proc insertText[Handle, Atom](parser: HTML5Parser[Handle, Atom], parent: Handle, text: string, before: Option[Handle]) = let dombuilder = parser.dombuilder dombuilder.insertText(dombuilder, parent, text, before) -proc remove[Handle](parser: HTML5Parser[Handle], child: Handle) = +proc remove[Handle, Atom](parser: HTML5Parser[Handle, Atom], child: Handle) = let dombuilder = parser.dombuilder dombuilder.remove(dombuilder, child) -proc moveChildren[Handle](parser: HTML5Parser[Handle], handleFrom, +proc moveChildren[Handle, Atom](parser: HTML5Parser[Handle, Atom], handleFrom, handleTo: Handle) = let dombuilder = parser.dombuilder dombuilder.moveChildren(dombuilder, handleFrom, handleTo) -proc addAttrsIfMissing[Handle](parser: HTML5Parser, element: Handle, - attrs: Table[string, string]) = +proc addAttrsIfMissing[Handle, Atom](parser: HTML5Parser[Handle, Atom], + element: Handle, attrs: Table[string, string]) = let dombuilder = parser.dombuilder if dombuilder.addAttrsIfMissing != nil: dombuilder.addAttrsIfMissing(dombuilder, element, attrs) -proc setScriptAlreadyStarted[Handle](parser: HTML5Parser, script: Handle) = +proc setScriptAlreadyStarted[Handle, Atom](parser: HTML5Parser[Handle, Atom], + script: Handle) = let dombuilder = parser.dombuilder if dombuilder.setScriptAlreadyStarted != nil: dombuilder.setScriptAlreadyStarted(dombuilder, script) -proc associateWithForm[Handle](parser: HTML5Parser, element, form, - intendedParent: Handle) = +proc associateWithForm[Handle, Atom](parser: HTML5Parser[Handle, Atom], + element, form, intendedParent: Handle) = let dombuilder = parser.dombuilder if dombuilder.associateWithForm != nil: dombuilder.associateWithForm(dombuilder, element, form, intendedParent) -func isSVGIntegrationPoint[Handle](parser: HTML5Parser, +func isSVGIntegrationPoint[Handle, Atom](parser: HTML5Parser[Handle, Atom], element: Handle): bool = let dombuilder = parser.dombuilder if dombuilder.isSVGIntegrationPoint != nil: @@ -388,16 +405,10 @@ func hasParseError(parser: HTML5Parser): bool = func tagNameEquals[Handle](parser: HTML5Parser, handle: Handle, token: Token): bool = - let tagType = parser.getTagType(handle) - if tagType != TAG_UNKNOWN: - return tagType == token.tagtype let localName = parser.getLocalName(handle) return localName == token.tagname func tagNameEquals[Handle](parser: HTML5Parser, a, b: Handle): bool = - let tagType = parser.getTagType(a) - if tagType != TAG_UNKNOWN: - return tagType == parser.getTagType(b) return parser.getLocalName(a) == parser.getLocalName(b) func fragment(parser: HTML5Parser): bool = @@ -449,16 +460,16 @@ proc resetInsertionMode(parser: var HTML5Parser) = if last: switch_insertion_mode_and_return IN_BODY -func currentNode[Handle](parser: HTML5Parser[Handle]): Handle = +func currentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom]): Handle = return parser.openElements[^1] -func adjustedCurrentNode[Handle](parser: HTML5Parser[Handle]): Handle = +func adjustedCurrentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom]): Handle = if parser.fragment: parser.opts.ctx.get else: parser.currentNode -func lastElementOfTag[Handle](parser: HTML5Parser[Handle], +func lastElementOfTag[Handle, Atom](parser: HTML5Parser[Handle, Atom], tagType: TagType): tuple[element: Option[Handle], pos: int] = for i in countdown(parser.openElements.high, 0): if parser.getTagType(parser.openElements[i]) == tagType: @@ -469,7 +480,7 @@ func last_child_of[Handle](n: Handle): AdjustedInsertionLocation[Handle] = (n, none(Handle)) # https://html.spec.whatwg.org/multipage/#appropriate-place-for-inserting-a-node -func appropriatePlaceForInsert[Handle](parser: HTML5Parser[Handle], +func appropriatePlaceForInsert[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: Handle): AdjustedInsertionLocation[Handle] = assert parser.getTagType(parser.openElements[0]) == TAG_HTML let targetTagType = parser.getTagType(target) @@ -495,23 +506,23 @@ func appropriatePlaceForInsert[Handle](parser: HTML5Parser[Handle], parser.dombuilder.getTemplateContent != nil: result = (parser.getTemplateContent(result.inside), none(Handle)) -func appropriatePlaceForInsert[Handle](parser: HTML5Parser[Handle]): +func appropriatePlaceForInsert[Handle, Atom](parser: HTML5Parser[Handle, Atom]): AdjustedInsertionLocation[Handle] = parser.appropriatePlaceForInsert(parser.currentNode) -func hasElement[Handle](parser: HTML5Parser[Handle], tag: TagType): bool = +func hasElement[Handle, Atom](parser: HTML5Parser[Handle, Atom], tag: TagType): bool = for element in parser.openElements: if parser.getTagType(element) == tag: return true return false -func hasElement[Handle](parser: HTML5Parser[Handle], tags: set[TagType]): bool = +func hasElement[Handle, Atom](parser: HTML5Parser[Handle, Atom], tags: set[TagType]): bool = for element in parser.openElements: if parser.getTagType(element) in tags: return true return false -func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], +func hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: Handle, list: set[TagType]): bool = for i in countdown(parser.openElements.high, 0): if parser.openElements[i] == target: @@ -520,7 +531,7 @@ func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], return false assert false -func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], +func hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: TagType, list: set[TagType]): bool = for i in countdown(parser.openElements.high, 0): let tagType = parser.getTagType(parser.openElements[i]) @@ -530,7 +541,7 @@ func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], return false assert false -func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], +func hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: set[TagType], list: set[TagType]): bool = for i in countdown(parser.openElements.high, 0): let tagType = parser.getTagType(parser.openElements[i]) @@ -546,38 +557,38 @@ const Scope = { # Note: MathML is not implemented } -func hasElementInScope[Handle](parser: HTML5Parser[Handle], +func hasElementInScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: TagType): bool = return parser.hasElementInSpecificScope(target, Scope) -func hasElementInScope[Handle](parser: HTML5Parser[Handle], +func hasElementInScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: set[TagType]): bool = return parser.hasElementInSpecificScope(target, Scope) -func hasElementInScope[Handle](parser: HTML5Parser[Handle], +func hasElementInScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: Handle): bool = return parser.hasElementInSpecificScope(target, Scope) -func hasElementInListItemScope[Handle](parser: HTML5Parser[Handle], +func hasElementInListItemScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: TagType): bool = const ListItemScope = Scope + {TAG_OL, TAG_UL} return parser.hasElementInSpecificScope(target, ListItemScope) -func hasElementInButtonScope[Handle](parser: HTML5Parser[Handle], +func hasElementInButtonScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: TagType): bool = const ButtonScope = Scope + {TAG_BUTTON} return parser.hasElementInSpecificScope(target, ButtonScope) const TableScope = {TAG_HTML, TAG_TABLE, TAG_TEMPLATE} -func hasElementInTableScope[Handle](parser: HTML5Parser[Handle], +func hasElementInTableScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: TagType): bool = return parser.hasElementInSpecificScope(target, TableScope) -func hasElementInTableScope[Handle](parser: HTML5Parser[Handle], +func hasElementInTableScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: set[TagType]): bool = return parser.hasElementInSpecificScope(target, TableScope) -func hasElementInSelectScope[Handle](parser: HTML5Parser[Handle], +func hasElementInSelectScope[Handle, Atom](parser: HTML5Parser[Handle, Atom], target: TagType): bool = for i in countdown(parser.openElements.high, 0): let tagType = parser.getTagType(parser.openElements[i]) @@ -587,24 +598,22 @@ func hasElementInSelectScope[Handle](parser: HTML5Parser[Handle], return false assert false -func createElement[Handle](parser: HTML5Parser[Handle], token: Token, +func createElement[Handle, Atom](parser: HTML5Parser[Handle, Atom], token: Token, namespace: Namespace, intendedParent: Handle): Handle = #TODO custom elements - let localName = token.tagname - let element = parser.createElement(localName, namespace, token.tagtype, - token.attrs) + let element = parser.createElement(token.tagname, namespace, token.attrs) if token.tagtype in FormAssociatedElements and parser.form.isSome and not parser.hasElement(TAG_TEMPLATE) and (token.tagtype notin ListedElements or "form" notin token.attrs): parser.associateWithForm(element, parser.form.get, intendedParent) return element -proc pushElement[Handle](parser: var HTML5Parser[Handle], node: Handle) = +proc pushElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom], node: Handle) = parser.openElements.add(node) let node = parser.adjustedCurrentNode() parser.tokenizer.hasnonhtml = parser.getNamespace(node) != Namespace.HTML -proc popElement[Handle](parser: var HTML5Parser[Handle]): Handle = +proc popElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom]): Handle = result = parser.openElements.pop() if parser.dombuilder.elementPopped != nil: parser.dombuilder.elementPopped(parser.dombuilder, result) @@ -616,14 +625,14 @@ proc popElement[Handle](parser: var HTML5Parser[Handle]): Handle = template pop_current_node = discard parser.popElement() -proc insert[Handle](parser: HTML5Parser[Handle], +proc insert[Handle, Atom](parser: HTML5Parser[Handle, Atom], location: AdjustedInsertionLocation[Handle], node: Handle) = parser.insertBefore(location.inside, node, location.before) -proc append[Handle](parser: HTML5Parser[Handle], parent, node: Handle) = +proc append[Handle, Atom](parser: HTML5Parser[Handle, Atom], parent, node: Handle) = parser.insertBefore(parent, node, none(Handle)) -proc insertForeignElement[Handle](parser: var HTML5Parser[Handle], token: Token, +proc insertForeignElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom], token: Token, namespace: Namespace): Handle = let location = parser.appropriatePlaceForInsert() let element = parser.createElement(token, namespace, location.inside) @@ -632,7 +641,7 @@ proc insertForeignElement[Handle](parser: var HTML5Parser[Handle], token: Token, parser.pushElement(element) return element -proc insertHTMLElement[Handle](parser: var HTML5Parser[Handle], +proc insertHTMLElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom], token: Token): Handle = return parser.insertForeignElement(token, Namespace.HTML) @@ -710,7 +719,7 @@ proc insertCharacter(parser: var HTML5Parser, data: string) = return insertText(parser, location.inside, $data, location.before) -proc insertComment[Handle](parser: var HTML5Parser[Handle], token: Token, +proc insertComment[Handle, Atom](parser: var HTML5Parser[Handle, Atom], token: Token, position: AdjustedInsertionLocation[Handle]) = let comment = parser.createComment(token.data) parser.insert(position, comment) @@ -850,7 +859,7 @@ proc popElementsIncl(parser: var HTML5Parser, tags: set[TagType]) = discard # Pop all elements, including the specified element. -proc popElementsIncl[Handle](parser: var HTML5Parser[Handle], handle: Handle) = +proc popElementsIncl[Handle, Atom](parser: var HTML5Parser[Handle, Atom], handle: Handle) = while parser.popElement() != handle: discard @@ -878,7 +887,7 @@ proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) = discard parser.popElement() # https://html.spec.whatwg.org/multipage/parsing.html#push-onto-the-list-of-active-formatting-elements -proc pushOntoActiveFormatting[Handle](parser: var HTML5Parser[Handle], +proc pushOntoActiveFormatting[Handle, Atom](parser: var HTML5Parser[Handle, Atom], element: Handle, token: Token) = var count = 0 for i in countdown(parser.activeFormatting.high, 0): @@ -900,7 +909,7 @@ proc pushOntoActiveFormatting[Handle](parser: var HTML5Parser[Handle], proc tostr(ftype: enum): string = return ($ftype).split('_')[1..^1].join("-").toLowerAscii() -func handle2str[Handle](parser: HTML5Parser[Handle], node: Handle): string = +func handle2str[Handle, Atom](parser: HTML5Parser[Handle, Atom], node: Handle): string = case node.nodeType of ELEMENT_NODE: let tt = parser.getTagType(node) @@ -915,7 +924,7 @@ func handle2str[Handle](parser: HTML5Parser[Handle], node: Handle): string = else: result = "Node of " & $node.nodeType -proc dumpDocument[Handle](parser: var HTML5Parser[Handle]) = +proc dumpDocument[Handle, Atom](parser: var HTML5Parser[Handle, Atom]) = let document = parser.getDocument() var s = "" for x in document.childList: @@ -923,7 +932,7 @@ proc dumpDocument[Handle](parser: var HTML5Parser[Handle]) = echo s ]# -proc reconstructActiveFormatting[Handle](parser: var HTML5Parser[Handle]) = +proc reconstructActiveFormatting[Handle, Atom](parser: var HTML5Parser[Handle, Atom]) = type State = enum REWIND, ADVANCE, CREATE if parser.activeFormatting.len == 0 or @@ -961,7 +970,7 @@ proc clearActiveFormattingTillMarker(parser: var HTML5Parser) = parser.activeFormatting.pop()[0].isSome: discard -func isHTMLIntegrationPoint[Handle](parser: HTML5Parser[Handle], +func isHTMLIntegrationPoint[Handle, Atom](parser: HTML5Parser[Handle, Atom], element: Handle): bool = return parser.isSVGIntegrationPoint(element) # (NOTE MathML not implemented) @@ -1016,7 +1025,7 @@ proc parseErrorByTokenType(parser: var HTML5Parser, tokenType: TokenType) = doAssert false # Find a node in the list of active formatting elements, or return -1. -func findLastActiveFormatting[Handle](parser: var HTML5Parser[Handle], +func findLastActiveFormatting[Handle, Atom](parser: var HTML5Parser[Handle, Atom], node: Handle): int = for i in countdown(parser.activeFormatting.high, 0): let it = parser.activeFormatting[i][0] @@ -1063,7 +1072,7 @@ func findFurthestBlockAfter(parser: var HTML5Parser, stackIndex: int): int = return i return -1 -func findLastActiveFormatting[Handle](parser: var HTML5Parser[Handle], +func findLastActiveFormatting[Handle, Atom](parser: var HTML5Parser[Handle, Atom], tagTypes: set[TagType]): int = for i in countdown(parser.activeFormatting.high, 0): let it = parser.activeFormatting[i][0] @@ -1072,7 +1081,7 @@ func findLastActiveFormatting[Handle](parser: var HTML5Parser[Handle], return -1 # If true is returned, call "any other end tag". -proc adoptionAgencyAlgorithm[Handle](parser: var HTML5Parser[Handle], +proc adoptionAgencyAlgorithm[Handle, Atom](parser: var HTML5Parser[Handle, Atom], token: Token): bool = template parse_error(e: ParseError) = parser.parseError(e) @@ -1161,6 +1170,10 @@ proc closeP(parser: var HTML5Parser) = while parser.getTagType(parser.popElement()) != TAG_P: discard +proc newStartTagToken[Handle, Atom](parser: HTML5Parser[Handle, Atom], + t: TagType): Token[Atom] = + return Token[Atom](t: START_TAG, tagname: parser.tagTypeToAtom(t)) + # Following is an implementation of the state (?) machine defined in # https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml # It uses the ad-hoc pattern matching macro `match' to apply the following @@ -1259,29 +1272,18 @@ macro match(token: Token, body: typed): untyped = assert s[i] in AsciiAlphaNumeric tagName &= s[i] inc i - let token = if s[1] == '/': - Token( - t: END_TAG, - tagname: tagName, - tagtype: tagType(tagName) - ) - else: - Token( - t: START_TAG, - tagname: tagName, - tagtype: tagType(tagName) - ) - let tt = int(token.tagtype) + let tt = int(tagType(tagName)) + let tokt = if s[1] != '/': START_TAG else: END_TAG var found = false - for i in 0..ofBranches[token.t].ofBranches.high: - if ofBranches[token.t].ofBranches[i][1] == action: + for i in 0..ofBranches[tokt].ofBranches.high: + if ofBranches[tokt].ofBranches[i][1] == action: found = true - ofBranches[token.t].ofBranches[i][0].add((quote do: TagType(`tt`))) - ofBranches[token.t].painted = true + ofBranches[tokt].ofBranches[i][0].add((quote do: TagType(`tt`))) + ofBranches[tokt].painted = true break if not found: - ofBranches[token.t].ofBranches.add((@[(quote do: TagType(`tt`))], action)) - ofBranches[token.t].painted = true + ofBranches[tokt].ofBranches.add((@[(quote do: TagType(`tt`))], action)) + ofBranches[tokt].painted = true of nnkDiscardStmt: defaultBranch = action of nnkTupleConstr: @@ -1343,7 +1345,7 @@ macro match(token: Token, body: typed): untyped = stmts.add(stmt) result = newBlockStmt(ident("inside_not_else"), stmts) -proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], +proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], token: Token, insertionMode: InsertionMode) = template anything_else = discard "anything_else" @@ -1436,7 +1438,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ("</head>", "</body>", "</html>", "</br>") => (block: anything_else) TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) other => (block: - let head = Token(t: START_TAG, tagtype: TAG_HEAD) + let head = parser.newStartTagToken(TAG_HEAD) parser.head = some(parser.insertHTMLElement(head)) parser.insertionMode = IN_HEAD reprocess token @@ -1577,7 +1579,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ("<head>") => (block: parse_error UNEXPECTED_START_TAG) (TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) other => (block: - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_BODY)) + discard parser.insertHTMLElement(parser.newStartTagToken(TAG_BODY)) parser.insertionMode = IN_BODY reprocess token ) @@ -1805,7 +1807,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], "</p>" => (block: if not parser.hasElementInButtonScope(TAG_P): parse_error ELEMENT_NOT_IN_SCOPE - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_P)) + discard parser.insertHTMLElement(parser.newStartTagToken(TAG_P)) parser.closeP() ) "</li>" => (block: @@ -1903,7 +1905,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "</br>" => (block: parse_error UNEXPECTED_END_TAG - reprocess Token(t: START_TAG, tagtype: TAG_BR) + reprocess parser.newStartTagToken(TAG_BR) ) ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block: parser.reconstructActiveFormatting() @@ -1931,7 +1933,12 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "<image>" => (block: #TODO ew - let token = Token(t: START_TAG, tagtype: TAG_IMG, tagname: "img", selfclosing: token.selfclosing, attrs: token.attrs) + let token = Token( + t: START_TAG, + tagname: parser.tagTypeToAtom(TAG_IMG), + selfclosing: token.selfclosing, + attrs: token.attrs + ) reprocess token ) "<textarea>" => (block: @@ -2059,13 +2066,13 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "<colgroup>" => (block: clear_the_stack_back_to_a_table_context - let colgroupTok = Token(t: START_TAG, tagtype: TAG_COLGROUP) + let colgroupTok = parser.newStartTagToken(TAG_COLGROUP) discard parser.insertHTMLElement(colgroupTok) parser.insertionMode = IN_COLUMN_GROUP ) "<col>" => (block: clear_the_stack_back_to_a_table_context - let colgroupTok = Token(t: START_TAG, tagtype: TAG_COLGROUP) + let colgroupTok = parser.newStartTagToken(TAG_COLGROUP) discard parser.insertHTMLElement(colgroupTok) parser.insertionMode = IN_COLUMN_GROUP reprocess token @@ -2077,7 +2084,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) ("<td>", "<th>", "<tr>") => (block: clear_the_stack_back_to_a_table_context - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TBODY)) + discard parser.insertHTMLElement(parser.newStartTagToken(TAG_TBODY)) parser.insertionMode = IN_TABLE_BODY reprocess token ) @@ -2236,7 +2243,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ("<th>", "<td>") => (block: parse_error UNEXPECTED_START_TAG clear_the_stack_back_to_a_table_body_context - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TR)) + discard parser.insertHTMLElement(parser.newStartTagToken(TAG_TR)) parser.insertionMode = IN_ROW reprocess token ) @@ -2579,46 +2586,6 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) other => (block: parser.parseErrorByTokenType(token.t)) -const CaseTable = { - "altglyph": "altGlyph", - "altglyphdef": "altGlyphDef", - "altglyphitem": "altGlyphItem", - "animatecolor": "animateColor", - "animatemotion": "animateMotion", - "animatetransform": "animateTransform", - "clippath": "clipPath", - "feblend": "feBlend", - "fecolormatrix": "feColorMatrix", - "fecomponenttransfer": "feComponentTransfer", - "fecomposite": "feComposite", - "feconvolvematrix": "feConvolveMatrix", - "fediffuselighting": "feDiffuseLighting", - "fedisplacementmap": "feDisplacementMap", - "fedistantlight": "feDistantLight", - "fedropshadow": "feDropShadow", - "feflood": "feFlood", - "fefunca": "feFuncA", - "fefuncb": "feFuncB", - "fefuncg": "feFuncG", - "fefuncr": "feFuncR", - "fegaussianblur": "feGaussianBlur", - "feimage": "feImage", - "femerge": "feMerge", - "femergenode": "feMergeNode", - "femorphology": "feMorphology", - "feoffset": "feOffset", - "fepointlight": "fePointLight", - "fespecularlighting": "feSpecularLighting", - "fespotlight": "feSpotLight", - "fetile": "feTile", - "feturbulence": "feTurbulence", - "foreignobject": "foreignObject", - "glyphref": "glyphRef", - "lineargradient": "linearGradient", - "radialgradient": "radialGradient", - "textpath": "textPath", -}.toTable() - proc processInForeignContent(parser: var HTML5Parser, token: Token) = macro `=>`(v: typed, body: untyped): untyped = quote do: @@ -2672,8 +2639,8 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) = #NOTE MathML not implemented let namespace = parser.getNamespace(parser.adjustedCurrentNode) if namespace == Namespace.SVG: - if token.tagname in CaseTable: - token.tagname = CaseTable[token.tagname] + parser.caseTable.withValue(token.tagname, p): + token.tagname = p[] adjustSVGAttributes(token) #TODO adjust foreign attributes discard parser.insertForeignElement(token, namespace) @@ -2684,15 +2651,17 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) = ) "</script>" => (block: let namespace = parser.getNamespace(parser.currentNode) - let localName = parser.getLocalName(parser.currentNode) - if namespace == Namespace.SVG and localName == "script": #TODO SVG + let localName = parser.getTagType(parser.currentNode) + # Any atom corresponding to the string "script" must have the same + # value as TAG_SCRIPT, so this is correct. + if namespace == Namespace.SVG and localName == TAG_SCRIPT: #TODO SVG script_end_tag else: any_other_end_tag ) TokenType.END_TAG => (block: any_other_end_tag) -proc constructTree[Handle](parser: var HTML5Parser[Handle]) = +proc constructTree[Handle, Atom](parser: var HTML5Parser[Handle, Atom]) = for token in parser.tokenizer.tokenize: if parser.ignoreLF: parser.ignoreLF = false @@ -2724,6 +2693,7 @@ proc finishParsing(parser: var HTML5Parser) = # Any of these pointers being nil would later result in a crash. proc checkCallbacks(dombuilder: DOMBuilder) = doAssert dombuilder.getDocument != nil + doAssert dombuilder.getAtomFactory != nil doAssert dombuilder.getParentNode != nil doAssert dombuilder.getLocalName != nil doAssert dombuilder.createElement != nil @@ -2734,19 +2704,72 @@ proc checkCallbacks(dombuilder: DOMBuilder) = doAssert dombuilder.remove != nil doAssert dombuilder.moveChildren != nil -proc parseHTML*[Handle](inputStream: Stream, dombuilder: DOMBuilder[Handle], - opts: HTML5ParserOpts[Handle]) = +proc checkCallbacks(factory: AtomFactory) = + doAssert factory.strToAtomImpl != nil + doAssert factory.tagTypeToAtomImpl != nil + +const CaseTable = { + "altglyph": "altGlyph", + "altglyphdef": "altGlyphDef", + "altglyphitem": "altGlyphItem", + "animatecolor": "animateColor", + "animatemotion": "animateMotion", + "animatetransform": "animateTransform", + "clippath": "clipPath", + "feblend": "feBlend", + "fecolormatrix": "feColorMatrix", + "fecomponenttransfer": "feComponentTransfer", + "fecomposite": "feComposite", + "feconvolvematrix": "feConvolveMatrix", + "fediffuselighting": "feDiffuseLighting", + "fedisplacementmap": "feDisplacementMap", + "fedistantlight": "feDistantLight", + "fedropshadow": "feDropShadow", + "feflood": "feFlood", + "fefunca": "feFuncA", + "fefuncb": "feFuncB", + "fefuncg": "feFuncG", + "fefuncr": "feFuncR", + "fegaussianblur": "feGaussianBlur", + "feimage": "feImage", + "femerge": "feMerge", + "femergenode": "feMergeNode", + "femorphology": "feMorphology", + "feoffset": "feOffset", + "fepointlight": "fePointLight", + "fespecularlighting": "feSpecularLighting", + "fespotlight": "feSpotLight", + "fetile": "feTile", + "feturbulence": "feTurbulence", + "foreignobject": "foreignObject", + "glyphref": "glyphRef", + "lineargradient": "linearGradient", + "radialgradient": "radialGradient", + "textpath": "textPath", +} + +proc createCaseTable[Handle, Atom](parser: var HTML5Parser[Handle, Atom]) = + for (k, v) in CaseTable: + let ka = parser.strToAtom(k) + let va = parser.strToAtom(v) + parser.caseTable[ka] = va + +proc parseHTML*[Handle, Atom](inputStream: Stream, + dombuilder: DOMBuilder[Handle, Atom], opts: HTML5ParserOpts[Handle]) = ## Parse an HTML document, using the DOMBuilder object `dombuilder`, and ## parser options `opts`. dombuilder.checkCallbacks() let tokstate = opts.initialTokenizerState - var parser = HTML5Parser[Handle]( + let factory = dombuilder.getAtomFactory(dombuilder) + var parser = HTML5Parser[Handle, Atom]( dombuilder: dombuilder, + factory: factory, opts: opts, openElements: opts.openElementsInit, form: opts.formInit, framesetOk: true ) + parser.createCaseTable() if opts.openElementsInit.len > 0: parser.resetInsertionMode() if opts.pushInTemplate: @@ -2757,6 +2780,11 @@ proc parseHTML*[Handle](inputStream: Stream, dombuilder: DOMBuilder[Handle], x else: nil - parser.tokenizer = newTokenizer(inputStream, onParseError, tokstate) + parser.tokenizer = newTokenizer[Atom]( + inputStream, + onParseError, + factory, + tokstate + ) parser.constructTree() parser.finishParsing() diff --git a/chame/htmltokenizer.nim b/chame/htmltokenizer.nim index 8526ff87..bfb0da02 100644 --- a/chame/htmltokenizer.nim +++ b/chame/htmltokenizer.nim @@ -8,6 +8,7 @@ import std/strutils import std/tables import std/unicode +import atoms import entity import parseerror import tags @@ -21,22 +22,23 @@ const bufLen = 4096 const copyBufLen = 64 type - Tokenizer* = object + Tokenizer*[Atom] = object + factory: AtomFactory[Atom] state*: TokenizerState rstate: TokenizerState tmp: string code: uint32 - tok: Token - laststart*: Token + tok: Token[Atom] + laststart*: Token[Atom] attrn: string attrv: string attr: bool hasnonhtml*: bool onParseError: proc(e: ParseError) - tokqueue: seq[Token] + tokqueue: seq[Token[Atom]] charbuf: string isws: bool - peekBuf: string + tagNameBuf: string stream: Stream sbuf: array[bufLen, char] @@ -82,7 +84,7 @@ type DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE, DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END - Token* = ref object + Token*[Atom] = ref object case t*: TokenType of DOCTYPE: quirks*: bool @@ -91,8 +93,7 @@ type sysid*: Option[string] of START_TAG, END_TAG: selfclosing*: bool - tagname*: string - tagtype*: TagType + tagname*: Atom attrs*: Table[string, string] of CHARACTER, CHARACTER_WHITESPACE: s*: string @@ -109,6 +110,9 @@ func `$`*(tok: Token): string = of COMMENT: fmt"{tok.t} {tok.data}" of EOF: fmt"{tok.t}" +func tagtype*(tok: Token): TagType = + return tok.tagname.toTagType() + const hexCharMap = (func(): array[char, uint32] = for i in 0u32..255u32: case chr(i) @@ -138,14 +142,18 @@ proc readn(t: var Tokenizer) = if t.stream.atEnd: t.eof_i = t.sbufLen -proc newTokenizer*(s: Stream, onParseError: proc(e: ParseError), - initialState = DATA): Tokenizer = - var t = Tokenizer( +proc strToAtom[Atom](tokenizer: Tokenizer[Atom], s: string): Atom = + return tokenizer.factory.strToAtomImpl(tokenizer.factory, s) + +proc newTokenizer*[Atom](s: Stream, onParseError: proc(e: ParseError), + factory: AtomFactory[Atom], initialState = DATA): Tokenizer[Atom] = + var t = Tokenizer[Atom]( stream: s, eof_i: -1, sbuf_i: 0, onParseError: onParseError, - state: initialState + state: initialState, + factory: factory ) t.readn() return t @@ -179,12 +187,12 @@ proc consume(t: var Tokenizer): char = proc reconsume(t: var Tokenizer) = dec t.sbuf_i -proc flushChars(tokenizer: var Tokenizer) = +proc flushChars[Atom](tokenizer: var Tokenizer[Atom]) = if tokenizer.charbuf.len > 0: let token = if not tokenizer.isws: - Token(t: CHARACTER, s: tokenizer.charbuf) + Token[Atom](t: CHARACTER, s: tokenizer.charbuf) else: - Token(t: CHARACTER_WHITESPACE, s: tokenizer.charbuf) + Token[Atom](t: CHARACTER_WHITESPACE, s: tokenizer.charbuf) tokenizer.tokqueue.add(token) tokenizer.isws = false tokenizer.charbuf.setLen(0) @@ -254,7 +262,7 @@ proc numericCharacterReferenceEndState(tokenizer: var Tokenizer) = for c in s: tokenizer.emit(c) -iterator tokenize*(tokenizer: var Tokenizer): Token = +iterator tokenize*[Atom](tokenizer: var Tokenizer[Atom]): Token[Atom] = var running = true template emit(tok: Token) = @@ -262,9 +270,9 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = if tok.t == START_TAG: tokenizer.laststart = tok if tok.t in {START_TAG, END_TAG}: - tok.tagtype = tagType(tok.tagname) + tok.tagname = tokenizer.strToAtom(tokenizer.tagNameBuf) tokenizer.tokqueue.add(tok) - template emit(tok: TokenType) = emit Token(t: tok) + template emit(tok: TokenType) = emit Token[Atom](t: tok) template emit(s: static string) = static: doAssert AsciiWhitespace notin s @@ -275,7 +283,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.emit(ch) template emit_null = tokenizer.flushChars() - emit Token(t: CHARACTER_NULL) + emit Token[Atom](t: CHARACTER_NULL) template emit_eof = emit EOF running = false @@ -293,8 +301,9 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = template parse_error(error: untyped) = tokenizer.parseError(error) template is_appropriate_end_tag_token(): bool = + #TODO this unnecessarily hashes twice tokenizer.laststart != nil and - tokenizer.laststart.tagname == tokenizer.tok.tagname + tokenizer.laststart.tagname == tokenizer.strToAtom(tokenizer.tagNameBuf) template start_new_attribute = if tokenizer.tok.t == START_TAG and tokenizer.attr: tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv @@ -358,43 +367,6 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.attr = false tokenizer.tok = t - # Fake EOF as an actual character. Also replace anything_else with the else - # branch. - macro stateMachine(states: varargs[untyped]): untyped = - var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state) - for state in states: - if state.kind == nnkOfBranch: - var mainstmtlist: NimNode - var mainstmtlist_i = -1 - for i in 0 ..< state.len: - if state[i].kind == nnkStmtList: - mainstmtlist = state[i] - mainstmtlist_i = i - break - var hasanythingelse = false - if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else": - hasanythingelse = true - - let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt) - var elsestmts: NimNode - - for i in countdown(childcase.len-1, 0): - let childof = childcase[i] - if childof.kind == nnkElse: - elsestmts = childof.findChild(it.kind == nnkStmtList) - - if hasanythingelse: - let fake_anything_else = quote do: - template anything_else = - `elsestmts` - mainstmtlist.insert(0, fake_anything_else) - state[mainstmtlist_i] = mainstmtlist - maincase.add(state) - result = newNimNode(nnkStmtList) - result.add(maincase) - - template has_anything_else = discard # does nothing - const null = char(0) while not tokenizer.atEof: @@ -403,7 +375,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state s let c = tokenizer.consume() - stateMachine: # => case tokenizer.state + case tokenizer.state of DATA: case c of '&': switch_state_return CHARACTER_REFERENCE @@ -450,11 +422,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '!': switch_state MARKUP_DECLARATION_OPEN of '/': switch_state END_TAG_OPEN of AsciiAlpha: - new_token Token(t: START_TAG) + new_token Token[Atom](t: START_TAG) + tokenizer.tagNameBuf = "" reconsume_in TAG_NAME of '?': parse_error UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME - new_token Token(t: COMMENT) + new_token Token[Atom](t: COMMENT) reconsume_in BOGUS_COMMENT else: parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME @@ -464,14 +437,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of END_TAG_OPEN: case c of AsciiAlpha: - new_token Token(t: END_TAG) + new_token Token[Atom](t: END_TAG) + tokenizer.tagNameBuf = "" reconsume_in TAG_NAME of '>': parse_error MISSING_END_TAG_NAME switch_state DATA else: parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME - new_token Token(t: COMMENT) + new_token Token[Atom](t: COMMENT) reconsume_in BOGUS_COMMENT of TAG_NAME: @@ -481,11 +455,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '>': switch_state DATA emit_tok - of AsciiUpperAlpha: tokenizer.tok.tagname &= c.toLowerAscii() + of AsciiUpperAlpha: tokenizer.tagNameBuf &= c.toLowerAscii() of null: parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.tagname &= "\uFFFD" - else: tokenizer.tok.tagname &= c + tokenizer.tagNameBuf &= "\uFFFD" + else: tokenizer.tagNameBuf &= c of RCDATA_LESS_THAN_SIGN: case c @@ -499,14 +473,19 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of RCDATA_END_TAG_OPEN: case c of AsciiAlpha: - new_token Token(t: END_TAG) + new_token Token[Atom](t: END_TAG) + tokenizer.tagNameBuf = "" reconsume_in RCDATA_END_TAG_NAME else: emit "</" reconsume_in RCDATA of RCDATA_END_TAG_NAME: - has_anything_else + template anything_else = + new_token nil #TODO + emit "</" + emit_tmp + reconsume_in RCDATA case c of AsciiWhitespace: if is_appropriate_end_tag_token: @@ -525,13 +504,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= c.toLowerAscii() + tokenizer.tagNameBuf &= c.toLowerAscii() tokenizer.tmp &= c else: - new_token nil #TODO - emit "</" - emit_tmp - reconsume_in RCDATA + anything_else of RAWTEXT_LESS_THAN_SIGN: case c @@ -545,14 +521,19 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of RAWTEXT_END_TAG_OPEN: case c of AsciiAlpha: - new_token Token(t: END_TAG) + new_token Token[Atom](t: END_TAG) + tokenizer.tagNameBuf = "" reconsume_in RAWTEXT_END_TAG_NAME else: emit "</" reconsume_in RAWTEXT of RAWTEXT_END_TAG_NAME: - has_anything_else + template anything_else = + new_token nil #TODO + emit "</" + emit_tmp + reconsume_in RAWTEXT case c of AsciiWhitespace: if is_appropriate_end_tag_token: @@ -571,13 +552,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= c.toLowerAscii() + tokenizer.tagNameBuf &= c.toLowerAscii() tokenizer.tmp &= c else: - new_token nil #TODO - emit "</" - emit_tmp - reconsume_in RAWTEXT + anything_else of SCRIPT_DATA_LESS_THAN_SIGN: case c @@ -594,14 +572,18 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_END_TAG_OPEN: case c of AsciiAlpha: - new_token Token(t: END_TAG) + new_token Token[Atom](t: END_TAG) + tokenizer.tagNameBuf = "" reconsume_in SCRIPT_DATA_END_TAG_NAME else: emit "</" reconsume_in SCRIPT_DATA of SCRIPT_DATA_END_TAG_NAME: - has_anything_else + template anything_else = + emit "</" + emit_tmp + reconsume_in SCRIPT_DATA case c of AsciiWhitespace: if is_appropriate_end_tag_token: @@ -620,12 +602,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= c.toLowerAscii() + tokenizer.tagNameBuf &= c.toLowerAscii() tokenizer.tmp &= c else: - emit "</" - emit_tmp - reconsume_in SCRIPT_DATA + anything_else of SCRIPT_DATA_ESCAPE_START: case c @@ -704,14 +684,18 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_ESCAPED_END_TAG_OPEN: case c of AsciiAlpha: - new_token Token(t: END_TAG) + new_token Token[Atom](t: END_TAG) + tokenizer.tagNameBuf = "" reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME else: emit "</" reconsume_in SCRIPT_DATA_ESCAPED of SCRIPT_DATA_ESCAPED_END_TAG_NAME: - has_anything_else + template anything_else = + emit "</" + emit_tmp + reconsume_in SCRIPT_DATA_ESCAPED case c of AsciiWhitespace: if is_appropriate_end_tag_token: @@ -730,12 +714,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: - tokenizer.tok.tagname &= c.toLowerAscii() + tokenizer.tagNameBuf &= c.toLowerAscii() tokenizer.tmp &= c else: - emit "</" - emit_tmp - reconsume_in SCRIPT_DATA_ESCAPED + anything_else of SCRIPT_DATA_DOUBLE_ESCAPE_START: case c @@ -832,7 +814,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = reconsume_in ATTRIBUTE_NAME of ATTRIBUTE_NAME: - has_anything_else + template anything_else = + tokenizer.attrn &= c case c of AsciiWhitespace, '/', '>': leave_attribute_name_state @@ -849,7 +832,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = parse_error UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME anything_else else: - tokenizer.attrn &= c + anything_else of AFTER_ATTRIBUTE_NAME: case c @@ -942,11 +925,14 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: tokenizer.tok.data &= c of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway - has_anything_else + template anything_else = + parse_error INCORRECTLY_OPENED_COMMENT + new_token Token[Atom](t: COMMENT) + reconsume_in BOGUS_COMMENT case c of '-': if not tokenizer.atEof and peek_char == '-': - new_token Token(t: COMMENT) + new_token Token[Atom](t: COMMENT) tokenizer.state = COMMENT_START consume_and_discard 1 else: anything_else @@ -962,13 +948,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state CDATA_SECTION else: parse_error CDATA_IN_HTML_CONTENT - new_token Token(t: COMMENT, data: "[CDATA[") + new_token Token[Atom](t: COMMENT, data: "[CDATA[") switch_state BOGUS_COMMENT else: anything_else else: - parse_error INCORRECTLY_OPENED_COMMENT - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT + anything_else of COMMENT_START: case c @@ -1069,19 +1053,19 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiWhitespace: discard of AsciiUpperAlpha: - new_token Token(t: DOCTYPE, name: some($c.toLowerAscii())) + new_token Token[Atom](t: DOCTYPE, name: some($c.toLowerAscii())) switch_state DOCTYPE_NAME of null: parse_error UNEXPECTED_NULL_CHARACTER - new_token Token(t: DOCTYPE, name: some($"\uFFFD")) + new_token Token[Atom](t: DOCTYPE, name: some($"\uFFFD")) switch_state DOCTYPE_NAME of '>': parse_error MISSING_DOCTYPE_NAME - new_token Token(t: DOCTYPE, quirks: true) + new_token Token[Atom](t: DOCTYPE, quirks: true) switch_state DATA emit_tok else: - new_token Token(t: DOCTYPE, name: some($c)) + new_token Token[Atom](t: DOCTYPE, name: some($c)) switch_state DOCTYPE_NAME of DOCTYPE_NAME: @@ -1099,7 +1083,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.name.get &= c of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway - has_anything_else + template anything_else = + parse_error INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE case c of AsciiWhitespace: discard of '>': @@ -1118,9 +1105,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else else: - parse_error INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE + anything_else of AFTER_DOCTYPE_PUBLIC_KEYWORD: case c @@ -1573,7 +1558,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_eof of MARKUP_DECLARATION_OPEN: parse_error INCORRECTLY_OPENED_COMMENT - new_token Token(t: COMMENT) + new_token Token[Atom](t: COMMENT) reconsume_in BOGUS_COMMENT of COMMENT_START: reconsume_in COMMENT @@ -1593,7 +1578,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_eof of DOCTYPE, BEFORE_DOCTYPE_NAME: parse_error EOF_IN_DOCTYPE - new_token Token(t: DOCTYPE, quirks: true) + new_token Token[Atom](t: DOCTYPE, quirks: true) emit_tok emit_eof of DOCTYPE_NAME, AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD, diff --git a/chame/minidom.nim b/chame/minidom.nim index e6c5478f..537046b5 100644 --- a/chame/minidom.nim +++ b/chame/minidom.nim @@ -8,16 +8,85 @@ ## ## For a variant that can switch encodings when meta tags are encountered etc. ## see `chame/minidom_cs <minidom.html>`. + import std/streams import std/tables import std/options +import std/hashes +import atoms import htmlparser import htmltokenizer import tags export tags +# Atom implementation +#TODO maybe we should use a better hash map. +const MAtomFactoryStrMapLength = 1024 # must be a power of 2 +static: + doAssert (MAtomFactoryStrMapLength and (MAtomFactoryStrMapLength - 1)) == 0 + +type + MAtom* = distinct int + + MAtomFactory* = ref object of AtomFactory[MAtom] + strMap: array[MAtomFactoryStrMapLength, seq[MAtom]] + atomMap: seq[string] + +# Mandatory Atom functions +func `==`*(a, b: MAtom): bool {.borrow.} +func toTagType*(atom: MAtom): TagType {.inline.} = + #TODO should probably get AtomFactory too... + if int(atom) <= int(high(TagType)): + return TagType(atom) + return TAG_UNKNOWN +func cmp*(a, b: MAtom): int {.inline.} = cmp(int(a), int(b)) +func hash*(atom: MAtom): Hash {.borrow.} + +func strToAtom(factory: AtomFactory[MAtom], s: string): MAtom +func tagTypeToAtom(factory: AtomFactory[MAtom], tagType: TagType): MAtom + +proc newMAtomFactory*(): MAtomFactory = + const minCap = int(TagType.high) + 1 + let factory = MAtomFactory( + atomMap: newSeqOfCap[string](minCap), + strToAtomImpl: strToAtom, + tagTypeToAtomImpl: tagTypeToAtom + ) + factory.atomMap.add("") # skip TAG_UNKNOWN + for tagType in TagType(int(TAG_UNKNOWN) + 1) .. TagType.high: + discard factory.strToAtom($tagType) + return factory + +func strToAtom*(factory: MAtomFactory, s: string): MAtom = + let h = s.hash() + let i = h and (factory.strMap.len - 1) + for atom in factory.strMap[i]: + if factory.atomMap[int(atom)] == s: + # Found + return atom + # Not found + let atom = MAtom(factory.atomMap.len) + factory.atomMap.add(s) + factory.strMap[i].add(atom) + return atom + +func strToAtom(factory: AtomFactory[MAtom], s: string): MAtom = + let factory = cast[MAtomFactory](factory) + return factory.strToAtom(s) + +func tagTypeToAtom*(factory: MAtomFactory, tagType: TagType): MAtom = + assert tagType != TAG_UNKNOWN + return MAtom(tagType) + +func tagTypeToAtom(factory: AtomFactory[MAtom], tagType: TagType): MAtom = + let factory = cast[MAtomFactory](factory) + return factory.tagTypeToAtom(tagType) + +func atomToStr*(factory: MAtomFactory, atom: MAtom): string = + return factory.atomMap[int(atom)] + # Node types type Node* = ref object of RootObj @@ -31,6 +100,7 @@ type Comment* = ref object of CharacterData Document* = ref object of Node + factory*: MAtomFactory Text* = ref object of CharacterData @@ -40,14 +110,18 @@ type systemId*: string Element* = ref object of Node - tagType*: TagType - localName*: string + localName*: MAtom namespace*: Namespace attrs*: Table[string, string] + document* {.cursor.}: Document type - MiniDOMBuilder* = ref object of DOMBuilder[Node] + MiniDOMBuilder* = ref object of DOMBuilder[Node, MAtom] document*: Document + factory*: MAtomFactory + +func tagType*(element: Element): TagType = + return element.localName.toTagType() # We use this to validate input strings, since htmltokenizer/htmlparser does no # input validation. @@ -55,7 +129,6 @@ proc toValidUTF8(s: string): string = result = "" var i = 0 while i < s.len: - let u = uint(s[i]) if int(s[i]) < 0x80: result &= s[i] inc i @@ -89,38 +162,41 @@ proc toValidUTF8(s: string): string = result &= "\uFFFD" inc i -proc getDocument(builder: DOMBuilder[Node]): Node = +proc localNameStr*(element: Element): string = + return element.document.factory.atomToStr(element.localName) + +proc getDocument(builder: DOMBuilder[Node, MAtom]): Node = return MiniDOMBuilder(builder).document -proc getParentNode(builder: DOMBuilder[Node], handle: Node): Option[Node] = - return option(handle.parentNode) +proc getAtomFactory(builder: DOMBuilder[Node, MAtom]): AtomFactory[MAtom] = + return MiniDOMBuilder(builder).factory -proc getTagType(builder: DOMBuilder[Node], handle: Node): TagType = - return Element(handle).tagType +proc getParentNode(builder: DOMBuilder[Node, MAtom], handle: Node): Option[Node] = + return option(handle.parentNode) -proc getLocalName(builder: DOMBuilder[Node], handle: Node): string = +proc getLocalName(builder: DOMBuilder[Node, MAtom], handle: Node): MAtom = return Element(handle).localName -proc getNamespace(builder: DOMBuilder[Node], handle: Node): Namespace = +proc getNamespace(builder: DOMBuilder[Node, MAtom], handle: Node): Namespace = return Element(handle).namespace -proc createElement(builder: DOMBuilder[Node], localName: string, - namespace: Namespace, tagType: TagType, - attrs: Table[string, string]): Node = +proc createElement(builder: DOMBuilder[Node, MAtom], localName: MAtom, + namespace: Namespace, attrs: Table[string, string]): Node = + let builder = cast[MiniDOMBuilder](builder) let element = Element( nodeType: ELEMENT_NODE, - localName: localName.toValidUTF8(), + localName: localName, namespace: namespace, - tagType: tagType + document: builder.document ) for k, v in attrs: element.attrs[k.toValidUTF8()] = v.toValidUTF8() return element -proc createComment(builder: DOMBuilder[Node], text: string): Node = +proc createComment(builder: DOMBuilder[Node, MAtom], text: string): Node = return Comment(nodeType: COMMENT_NODE, data: text.toValidUTF8()) -proc createDocumentType(builder: DOMBuilder[Node], name, publicId, +proc createDocumentType(builder: DOMBuilder[Node, MAtom], name, publicId, systemId: string): Node = return DocumentType( nodeType: DOCUMENT_TYPE_NODE, @@ -203,7 +279,7 @@ func preInsertionValidity*(parent, node: Node, before: Node): bool = else: discard return true # no exception reached -proc insertBefore(builder: DOMBuilder[Node], parent, child: Node, +proc insertBefore(builder: DOMBuilder[Node, MAtom], parent, child: Node, before: Option[Node]) = let before = before.get(nil) if parent.preInsertionValidity(child, before): @@ -215,7 +291,7 @@ proc insertBefore(builder: DOMBuilder[Node], parent, child: Node, parent.childList.insert(child, i) child.parentNode = parent -proc insertText(builder: DOMBuilder[Node], parent: Node, text: string, +proc insertText(builder: DOMBuilder[Node, MAtom], parent: Node, text: string, before: Option[Node]) = let text = text.toValidUTF8() let before = before.get(nil) @@ -235,20 +311,20 @@ proc insertText(builder: DOMBuilder[Node], parent: Node, text: string, let text = Text(nodeType: TEXT_NODE, data: text) insertBefore(builder, parent, text, option(before)) -proc remove(builder: DOMBuilder[Node], child: Node) = +proc remove(builder: DOMBuilder[Node, MAtom], child: Node) = if child.parentNode != nil: let i = child.parentNode.childList.find(child) child.parentNode.childList.delete(i) child.parentNode = nil -proc moveChildren(builder: DOMBuilder[Node], fromNode, toNode: Node) = +proc moveChildren(builder: DOMBuilder[Node, MAtom], fromNode, toNode: Node) = let tomove = @(fromNode.childList) fromNode.childList.setLen(0) for child in tomove: child.parentNode = nil insertBefore(builder, toNode, child, none(Node)) -proc addAttrsIfMissing(builder: DOMBuilder[Node], element: Node, +proc addAttrsIfMissing(builder: DOMBuilder[Node, MAtom], element: Node, attrs: Table[string, string]) = let element = Element(element) for k, v in attrs: @@ -258,7 +334,7 @@ proc addAttrsIfMissing(builder: DOMBuilder[Node], element: Node, proc initMiniDOMBuilder*(builder: MiniDOMBuilder) = builder.getDocument = getDocument - builder.getTagType = getTagType + builder.getAtomFactory = getAtomFactory builder.getParentNode = getParentNode builder.getLocalName = getLocalName builder.getNamespace = getNamespace @@ -271,26 +347,27 @@ proc initMiniDOMBuilder*(builder: MiniDOMBuilder) = builder.moveChildren = moveChildren builder.addAttrsIfMissing = addAttrsIfMissing -proc newMiniDOMBuilder*(): MiniDOMBuilder = - let document = Document(nodeType: DOCUMENT_NODE) - let builder = MiniDOMBuilder(document: document) +proc newMiniDOMBuilder*(factory: MAtomFactory): MiniDOMBuilder = + let document = Document(nodeType: DOCUMENT_NODE, factory: factory) + let builder = MiniDOMBuilder(document: document, factory: factory) builder.initMiniDOMBuilder() return builder -proc parseHTML*(inputStream: Stream, opts = HTML5ParserOpts[Node]()): Document = +proc parseHTML*(inputStream: Stream, opts = HTML5ParserOpts[Node](), + factory = newMAtomFactory()): Document = ## Read, parse and return an HTML document from `inputStream`, using - ## parser options `opts`. + ## parser options `opts` and MAtom factory `factory`. ## ## `inputStream` is not required to be seekable. ## ## For a description of `HTML5ParserOpts`, see the `htmlparser` module's ## documentation. - let builder = newMiniDOMBuilder() + let builder = newMiniDOMBuilder(factory) parseHTML(inputStream, builder, opts) return builder.document proc parseHTMLFragment*(inputStream: Stream, element: Element, - opts: HTML5ParserOpts[Node]): seq[Node] = + opts: HTML5ParserOpts[Node], factory = newMAtomFactory()): seq[Node] = ## Read, parse and return the children of an HTML fragment from `inputStream`, ## using context element `element` and parser options `opts`. ## @@ -302,7 +379,7 @@ proc parseHTMLFragment*(inputStream: Stream, element: Element, ## ## Note: the members `ctx`, `initialTokenizerState`, `openElementsInit` and ## `pushInTemplate` of `opts` are overridden (in accordance with the standard). - let builder = newMiniDOMBuilder() + let builder = newMiniDOMBuilder(factory) let document = builder.document let state = case element.tagType of TAG_TITLE, TAG_TEXTAREA: RCDATA @@ -311,7 +388,12 @@ proc parseHTMLFragment*(inputStream: Stream, element: Element, of TAG_NOSCRIPT: DATA # no scripting of TAG_PLAINTEXT: PLAINTEXT else: DATA - let root = Element(nodeType: ELEMENT_NODE, tagType: TAG_HTML, namespace: HTML) + let htmlAtom = builder.factory.tagTypeToAtom(TAG_HTML) + let root = Element( + nodeType: ELEMENT_NODE, + localName: htmlAtom, + namespace: HTML + ) document.childList = @[Node(root)] var opts = opts opts.ctx = some(Node(element)) diff --git a/chame/minidom_cs.nim b/chame/minidom_cs.nim index 871c3f2c..784ca26f 100644 --- a/chame/minidom_cs.nim +++ b/chame/minidom_cs.nim @@ -11,6 +11,7 @@ ## See also ## ======== ## * `chame/minidom <minidom.html>` + import std/streams import std/tables @@ -34,22 +35,20 @@ type CharsetMiniDOMBuilder = ref object of MiniDOMBuilder # decoderstream + encoderstream will always produce valid UTF-8; we define # these separately from minidom to avoid calling toValidUTF8(). -proc createElement(builder: DOMBuilder[Node], localName: string, - namespace: Namespace, tagType: TagType, - attrs: Table[string, string]): Node = +proc createElement(builder: DOMBuilder[Node, MAtom], localName: MAtom, + namespace: Namespace, attrs: Table[string, string]): Node = let element = Element( nodeType: ELEMENT_NODE, localName: localName, namespace: namespace, - tagType: tagType, attrs: attrs ) return element -proc createComment(builder: DOMBuilder[Node], text: string): Node = +proc createComment(builder: DOMBuilder[Node, MAtom], text: string): Node = return Comment(nodeType: COMMENT_NODE, data: text) -proc createDocumentType(builder: DOMBuilder[Node], name, publicId, +proc createDocumentType(builder: DOMBuilder[Node, MAtom], name, publicId, systemId: string): Node = return DocumentType( nodeType: DOCUMENT_TYPE_NODE, @@ -58,13 +57,13 @@ proc createDocumentType(builder: DOMBuilder[Node], name, publicId, systemId: systemId ) -proc addAttrsIfMissing(builder: DOMBuilder[Node], element: Node, +proc addAttrsIfMissing(builder: DOMBuilder[Node, MAtom], element: Node, attrs: Table[string, string]) = let element = Element(element) for k, v in attrs: discard element.attrs.hasKeyOrPut(k, v) -proc setEncoding(builder: DOMBuilder[Node], encoding: string): +proc setEncoding(builder: DOMBuilder[Node, MAtom], encoding: string): SetEncodingResult = let builder = CharsetMiniDOMBuilder(builder) let charset = getCharset(encoding) @@ -82,11 +81,15 @@ proc setEncoding(builder: DOMBuilder[Node], encoding: string): builder.charset = charset return SET_ENCODING_STOP -proc newCharsetMiniDOMBuilder(): CharsetMiniDOMBuilder = +proc newCharsetMiniDOMBuilder(factory: MAtomFactory): CharsetMiniDOMBuilder = let document = Document(nodeType: DOCUMENT_NODE) - let builder = CharsetMiniDOMBuilder(document: document) + let builder = CharsetMiniDOMBuilder(document: document, factory: factory) builder.initMiniDOMBuilder() builder.setEncoding = setEncoding + builder.createElement = createElement + builder.createComment = createComment + builder.createDocumentType = createDocumentType + builder.addAttrsIfMissing = addAttrsIfMissing return builder #TODO this should probably be in decoderstream @@ -107,7 +110,8 @@ proc bomSniff(inputStream: var Stream): Charset = inputStream.setPosition(0) proc parseHTML*(inputStream: Stream, opts: HTML5ParserOpts[Node], - charsets: seq[Charset], seekable = true): Document = + charsets: seq[Charset], seekable = true, + factory = newMAtomFactory()): Document = ## Read, parse and return an HTML document from `inputStream`. ## ## `charsets` is a list of input character sets to try. If empty, it will be @@ -141,7 +145,7 @@ proc parseHTML*(inputStream: Stream, opts: HTML5ParserOpts[Node], ## even `<meta charset=...` tags will be disregarded. ## (TODO: this should be improved in the future; theoretically we could still ## switch between ASCII-compatible charsets before non-ASCII is encountered.) - let builder = newCharsetMiniDOMBuilder() + let builder = newCharsetMiniDOMBuilder(factory) var charsetStack: seq[Charset] for i in countdown(charsets.high, 0): charsetStack.add(charsets[i]) diff --git a/chame/tags.nim b/chame/tags.nim index 767e8f60..1eb41a37 100644 --- a/chame/tags.nim +++ b/chame/tags.nim @@ -1,5 +1,4 @@ import std/tables -import std/strutils type NodeType* = enum @@ -17,27 +16,145 @@ type NOTATION_NODE = 12 TagType* = enum - TAG_UNKNOWN, TAG_APPLET, TAG_BIG, TAG_HTML, TAG_BASE, TAG_BASEFONT, - TAG_BGSOUND, TAG_HEAD, TAG_LINK, TAG_LISTING, TAG_META, TAG_STYLE, - TAG_TITLE, TAG_BODY, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_FOOTER, - TAG_HEADER, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HGROUP, - TAG_MAIN, TAG_NAV, TAG_SEARCH, TAG_SECTION, TAG_BLOCKQUOTE, TAG_DD, - TAG_DIV, TAG_DL, TAG_DT, TAG_FIGCAPTION, TAG_FIGURE, TAG_HR, TAG_LI, - TAG_OL, TAG_P, TAG_PRE, TAG_UL, TAG_A, TAG_ABBR, TAG_B, TAG_BDI, TAG_BDO, - TAG_BR, TAG_NOBR, TAG_CITE, TAG_CODE, TAG_DATA, TAG_DFN, TAG_EM, TAG_EMBED, - TAG_I, TAG_KBD, TAG_MARK, TAG_MARQUEE, TAG_Q, TAG_RB, TAG_RP, TAG_RT, - TAG_RTC, TAG_RUBY, TAG_S, TAG_SAMP, TAG_SMALL, TAG_SPAN, TAG_STRONG, - TAG_SUB, TAG_SUP, TAG_TIME, TAG_U, TAG_VAR, TAG_WBR, TAG_AREA, - TAG_AUDIO, TAG_IMG, TAG_IMAGE, TAG_MAP, TAG_TRACK, TAG_VIDEO, TAG_IFRAME, - TAG_OBJECT, TAG_PARAM, TAG_PICTURE, TAG_PORTAL, TAG_SOURCE, TAG_CANVAS, - TAG_NOSCRIPT, TAG_NOEMBED, TAG_PLAINTEXT, TAG_XMP, TAG_SCRIPT, TAG_DEL, - TAG_INS, TAG_CAPTION, TAG_COL, TAG_COLGROUP, TAG_TABLE, TAG_TBODY, - TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TR, TAG_BUTTON, TAG_DATALIST, - TAG_FIELDSET, TAG_FORM, TAG_INPUT, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, - TAG_METER, TAG_OPTGROUP, TAG_OPTION, TAG_OUTPUT, TAG_PROGRESS, TAG_SELECT, - TAG_TEXTAREA, TAG_DETAILS, TAG_DIALOG, TAG_MENU, TAG_SUMMARY, TAG_BLINK, - TAG_CENTER, TAG_CONTENT, TAG_DIR, TAG_FONT, TAG_FRAME, TAG_NOFRAMES, - TAG_FRAMESET, TAG_STRIKE, TAG_TT, TAG_TEMPLATE, TAG_SARCASM + TAG_UNKNOWN = "" + TAG_APPLET = "applet" + TAG_BIG = "big" + TAG_HTML = "html" + TAG_BASE = "base" + TAG_BASEFONT = "basefont" + TAG_BGSOUND = "bgsound" + TAG_HEAD = "head" + TAG_LINK = "link" + TAG_LISTING = "listing" + TAG_META = "meta" + TAG_STYLE = "style" + TAG_TITLE = "title" + TAG_BODY = "body" + TAG_ADDRESS = "address" + TAG_ARTICLE = "article" + TAG_ASIDE = "aside" + TAG_FOOTER = "footer" + TAG_HEADER = "header" + TAG_H1 = "h1" + TAG_H2 = "h2" + TAG_H3 = "h3" + TAG_H4 = "h4" + TAG_H5 = "h5" + TAG_H6 = "h6" + TAG_HGROUP = "hgroup" + TAG_MAIN = "main" + TAG_NAV = "nav" + TAG_SEARCH = "search" + TAG_SECTION = "section" + TAG_BLOCKQUOTE = "blockquote" + TAG_DD = "dd" + TAG_DIV = "div" + TAG_DL = "dl" + TAG_DT = "dt" + TAG_FIGCAPTION = "figcaption" + TAG_FIGURE = "figure" + TAG_HR = "hr" + TAG_LI = "li" + TAG_OL = "ol" + TAG_P = "p" + TAG_PRE = "pre" + TAG_UL = "ul" + TAG_A = "a" + TAG_ABBR = "abbr" + TAG_B = "b" + TAG_BDI = "bdi" + TAG_BDO = "bdo" + TAG_BR = "br" + TAG_NOBR = "nobr" + TAG_CITE = "cite" + TAG_CODE = "code" + TAG_DATA = "data" + TAG_DFN = "dfn" + TAG_EM = "em" + TAG_EMBED = "embed" + TAG_I = "i" + TAG_KBD = "kbd" + TAG_MARK = "mark" + TAG_MARQUEE = "marquee" + TAG_Q = "q" + TAG_RB = "rb" + TAG_RP = "rp" + TAG_RT = "rt" + TAG_RTC = "rtc" + TAG_RUBY = "ruby" + TAG_S = "s" + TAG_SAMP = "samp" + TAG_SMALL = "small" + TAG_SPAN = "span" + TAG_STRONG = "strong" + TAG_SUB = "sub" + TAG_SUP = "sup" + TAG_TIME = "time" + TAG_U = "u" + TAG_VAR = "var" + TAG_WBR = "wbr" + TAG_AREA = "area" + TAG_AUDIO = "audio" + TAG_IMG = "img" + TAG_IMAGE = "image" + TAG_MAP = "map" + TAG_TRACK = "track" + TAG_VIDEO = "video" + TAG_IFRAME = "iframe" + TAG_OBJECT = "object" + TAG_PARAM = "param" + TAG_PICTURE = "picture" + TAG_PORTAL = "portal" + TAG_SOURCE = "source" + TAG_CANVAS = "canvas" + TAG_NOSCRIPT = "noscript" + TAG_NOEMBED = "noembed" + TAG_PLAINTEXT = "plaintext" + TAG_XMP = "xmp" + TAG_SCRIPT = "script" + TAG_DEL = "del" + TAG_INS = "ins" + TAG_CAPTION = "caption" + TAG_COL = "col" + TAG_COLGROUP = "colgroup" + TAG_TABLE = "table" + TAG_TBODY = "tbody" + TAG_TD = "td" + TAG_TFOOT = "tfoot" + TAG_TH = "th" + TAG_THEAD = "thead" + TAG_TR = "tr" + TAG_BUTTON = "button" + TAG_DATALIST = "datalist" + TAG_FIELDSET = "fieldset" + TAG_FORM = "form" + TAG_INPUT = "input" + TAG_KEYGEN = "keygen" + TAG_LABEL = "label" + TAG_LEGEND = "legend" + TAG_METER = "meter" + TAG_OPTGROUP = "optgroup" + TAG_OPTION = "option" + TAG_OUTPUT = "output" + TAG_PROGRESS = "progress" + TAG_SELECT = "select" + TAG_TEXTAREA = "textarea" + TAG_DETAILS = "details" + TAG_DIALOG = "dialog" + TAG_MENU = "menu" + TAG_SUMMARY = "summary" + TAG_BLINK = "blink" + TAG_CENTER = "center" + TAG_CONTENT = "content" + TAG_DIR = "dir" + TAG_FONT = "font" + TAG_FRAME = "frame" + TAG_NOFRAMES = "noframes" + TAG_FRAMESET = "frameset" + TAG_STRIKE = "strike" + TAG_TT = "tt" + TAG_TEMPLATE = "template" + TAG_SARCASM = "sarcasm" QuirksMode* = enum NO_QUIRKS, QUIRKS, LIMITED_QUIRKS @@ -53,31 +170,20 @@ type func getTagTypeMap(): Table[string, TagType] = for i in TagType: - let enumname = $TagType(i) - let tagname = enumname.split('_')[1..^1].join("_").toLowerAscii() - result[tagname] = TagType(i) + result[$TagType(i)] = TagType(i) const tagTypeMap = getTagTypeMap() func tagType*(s: string): TagType = - if tagTypeMap.hasKey(s): + if s in tagTypeMap: return tagTypeMap[s] - else: - return TAG_UNKNOWN - -const tagNameMap = (func(): Table[TagType, string] = - for k, v in tagTypeMap: - result[v] = k -)() + return TAG_UNKNOWN const AllTagTypes* = (func(): set[TagType] = for tag in TagType: result.incl(tag) )() -func tagName*(t: TagType): string = - return tagNameMap[t] - const HTagTypes* = { TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6 } diff --git a/tests/shared/tree_common.nim b/tests/shared/tree_common.nim index bfc961f4..2eb7d476 100644 --- a/tests/shared/tree_common.nim +++ b/tests/shared/tree_common.nim @@ -33,6 +33,7 @@ type s: string i: int pi: int + factory: MAtomFactory func has(ctx: TCTestParser): bool = return ctx.i < ctx.s.len @@ -95,9 +96,8 @@ proc parseTestFragment(ctx: var TCTestParser): TCFragment = of FT_HTML: Namespace.HTML let element = Element( nodeType: ELEMENT_NODE, - tagType: tagType(line), namespace: namespace, - localName: line + localName: ctx.factory.strToAtom(line) ) return TCFragment( fragmentType: fragmentType, @@ -141,7 +141,7 @@ proc parseComment(s: string): Comment = ) proc parseTestDocument(ctx: var TCTestParser): Document = - result = Document(nodeType: DOCUMENT_NODE) + result = Document(nodeType: DOCUMENT_NODE, factory: ctx.factory) var stack: seq[Node] stack.add(result) template top: auto = stack[^1] @@ -172,12 +172,12 @@ proc parseTestDocument(ctx: var TCTestParser): Document = elif str.startsWith("<?"): assert false, "todo" elif str.startsWith("<"): - let tag = str.substr(1, str.high - 1) + let tagName = ctx.factory.strToAtom(str.substr(1, str.high - 1)) let element = Element( nodeType: ELEMENT_NODE, - tagType: tagType(tag), namespace: HTML, - localName: tag + localName: tagName, + document: result ) top.childList.add(element) stack.add(element) @@ -221,9 +221,9 @@ proc parseTest(ctx: var TCTestParser): TCTest = break return t -proc parseTests*(s: string): seq[TCTest] = +proc parseTests*(s: string, factory: MAtomFactory): seq[TCTest] = result = @[] - var parser = TCTestParser(s: s) + var parser = TCTestParser(s: s, factory: factory) while parser.i < s.len: let test = parser.parseTest() result.add(test) @@ -238,10 +238,7 @@ proc checkTest(nodein, nodep: Node) = of ELEMENT_NODE: let nodein = Element(nodein) let nodep = Element(nodep) - check nodein.tagType == nodep.tagType - #TODO figure out a better scheme - if nodein.tagType == TAG_UNKNOWN: - check nodein.localName == nodep.localName + check nodein.localName == nodep.localName check nodein.namespace == nodep.namespace check nodein.attrs == nodep.attrs of ATTRIBUTE_NODE, ENTITY_REFERENCE_NODE, ENTITY_NODE, diff --git a/tests/test1.nim b/tests/test1.nim index 22b7bae8..803c4c69 100644 --- a/tests/test1.nim +++ b/tests/test1.nim @@ -3,7 +3,6 @@ import unittest import tables -import strutils import streams import chame/tags @@ -33,26 +32,17 @@ func escapeText(s: string, attribute_mode = false): string = else: result &= c -proc tostr(ftype: enum): string = - return ($ftype).split('_')[1..^1].join("-").toLowerAscii() - func `$`*(node: Node): string = case node.nodeType of ELEMENT_NODE: let element = Element(node) - if element.tagType != TAG_UNKNOWN: - result = "<" & $element.tagType.tostr() - else: - result = "<" & element.localName + result = "<" & element.localNameStr for k, v in element.attrs: result &= ' ' & k & "=\"" & v.escapeText(true) & "\"" result &= ">" for node in element.childList: result &= $node - if element.tagType != TAG_UNKNOWN: - result &= "</" & $element.tagType.tostr() & ">" - else: - result &= "</" & $element.localName & ">" + result &= "</" & element.localNameStr & ">" of TEXT_NODE: let text = Text(node) result = text.data.escapeText() diff --git a/tests/tokenizer.nim b/tests/tokenizer.nim index 2bf43436..ec083bd4 100644 --- a/tests/tokenizer.nim +++ b/tests/tokenizer.nim @@ -6,8 +6,8 @@ import unicode import unittest import chame/htmltokenizer +import chame/minidom import chame/parseerror -import chame/tags const hexCharMap = (func(): array[char, int] = for i in 0..255: @@ -58,33 +58,32 @@ proc getAttrs(o: JsonNode, esc: bool): Table[string, string] = else: result[k] = v.getStr() -proc getToken(a: seq[JsonNode], esc: bool): Token = +proc getToken(factory: MAtomFactory, a: seq[JsonNode], esc: bool): + Token[MAtom] = case a[0].getStr() of "StartTag": - return Token( + return Token[MAtom]( t: START_TAG, - tagname: a[1].getStr(), - tagtype: tagType(a[1].getStr()), + tagname: factory.strToAtom(a[1].getStr()), attrs: getAttrs(a[2], esc), selfclosing: a.len > 3 and a[3].getBool() ) of "EndTag": - return Token( + return Token[MAtom]( t: END_TAG, - tagname: a[1].getStr(), - tagtype: tagType(a[1].getStr()), + tagname: factory.strToAtom(a[1].getStr()) ) of "Character": let s = if esc: doubleEscape(a[1].getStr()) else: a[1].getStr() - return Token( + return Token[MAtom]( t: CHARACTER, s: s ) of "DOCTYPE": - return Token( + return Token[MAtom]( t: DOCTYPE, quirks: not a[4].getBool(), # yes, this is reversed. don't ask name: if a[1].kind == JNull: none(string) else: some(a[1].getStr()), @@ -96,13 +95,13 @@ proc getToken(a: seq[JsonNode], esc: bool): Token = doubleEscape(a[1].getStr()) else: a[1].getStr() - return Token( + return Token[MAtom]( t: COMMENT, data: s ) else: discard -proc checkEquals(tok, otok: Token, desc: string) = +proc checkEquals(factory: MAtomFactory, tok, otok: Token, desc: string) = doAssert otok.t == tok.t, desc & " (tok t: " & $tok.t & " otok t: " & $otok.t & ")" case tok.t @@ -115,8 +114,8 @@ proc checkEquals(tok, otok: Token, desc: string) = doAssert tok.quirks == otok.quirks, desc of TokenType.START_TAG, TokenType.END_TAG: doAssert tok.tagname == otok.tagname, desc & " (tok tagname: " & - tok.tagname & " otok tagname " & otok.tagname & ")" - doAssert tok.tagtype == otok.tagtype, desc + factory.atomToStr(tok.tagname) & " otok tagname " & + factory.atomToStr(otok.tagname) & ")" if tok.t == TokenType.START_TAG: #TODO not sure if this is the best solution. but end tags can't really # be self-closing... @@ -132,36 +131,36 @@ proc checkEquals(tok, otok: Token, desc: string) = "otok data: " & otok.data & ")" of EOF, CHARACTER_NULL: discard -proc runTest(desc, input: string, output: seq[JsonNode], laststart: string, - esc: bool, state: TokenizerState = DATA) = +proc runTest(factory: MAtomFactory, desc, input: string, output: seq[JsonNode], + laststart: MAtom, esc: bool, state: TokenizerState = DATA) = let ds = newStringStream(input) proc onParseError(e: ParseError) = discard - var tokenizer = newTokenizer(ds, onParseError, state) - tokenizer.laststart = Token(t: START_TAG, tagname: laststart) + var tokenizer = newTokenizer(ds, onParseError, factory, state) + tokenizer.laststart = Token[MAtom](t: START_TAG, tagname: laststart) var i = 0 - var chartok: Token = nil + var chartok: Token[MAtom] = nil for tok in tokenizer.tokenize: check tok != nil if chartok != nil and tok.t notin {CHARACTER, CHARACTER_WHITESPACE, CHARACTER_NULL}: - let otok = getToken(output[i].getElems(), esc) - checkEquals(chartok, otok, desc) + let otok = getToken(factory, output[i].getElems(), esc) + checkEquals(factory, chartok, otok, desc) inc i chartok = nil if tok.t == EOF: break # html5lib-tests has no EOF tokens elif tok.t in {CHARACTER, CHARACTER_WHITESPACE}: if chartok == nil: - chartok = Token(t: CHARACTER) + chartok = Token[MAtom](t: CHARACTER) chartok.s &= tok.s elif tok.t == CHARACTER_NULL: if chartok == nil: - chartok = Token(t: CHARACTER) + chartok = Token[MAtom](t: CHARACTER) chartok.s &= char(0) else: - let otok = getToken(output[i].getElems(), esc) - checkEquals(tok, otok, desc) + let otok = getToken(factory, output[i].getElems(), esc) + checkEquals(factory, tok, otok, desc) inc i func getState(s: string): TokenizerState = @@ -192,16 +191,18 @@ proc runTests(filename: string) = if esc: input = doubleEscape(input) let output = t{"output"}.getElems() - let laststart = if "lastStartTag" in t: + let laststart0 = if "lastStartTag" in t: t{"lastStartTag"}.getStr() else: "" + let factory = newMAtomFactory() + let laststart = factory.strToAtom(laststart0) if "initialStates" notin t: - runTest(desc, input, output, laststart, esc) + runTest(factory, desc, input, output, laststart, esc) else: for state in t{"initialStates"}: let state = getState(state.getStr()) - runTest(desc, input, output, laststart, esc, state) + runTest(factory, desc, input, output, laststart, esc, state) test "contentModelFlags": runTests("contentModelFlags.test") diff --git a/tests/tree.nim b/tests/tree.nim index 9ccfd92a..14ae0cf7 100644 --- a/tests/tree.nim +++ b/tests/tree.nim @@ -2,17 +2,17 @@ include shared/tree_common import std/streams -proc runTest(test: TCTest, scripting: bool) = +proc runTest(test: TCTest, factory: MAtomFactory, scripting: bool) = let ss = newStringStream(test.data) let opts = HTML5ParserOpts[Node]( scripting: scripting ) let pdoc = if test.fragment.isNone: - parseHTML(ss, opts) + parseHTML(ss, opts, factory) else: let ctx = Element() ctx[] = test.fragment.get.ctx[] - let childList = parseHTMLFragment(ss, ctx, opts) + let childList = parseHTMLFragment(ss, ctx, opts, factory) for child in childList: if ctx.preInsertionValidity(child, nil): ctx.childList.add(child) @@ -33,16 +33,17 @@ proc runTest(test: TCTest, scripting: bool) = const rootpath = "tests/html5lib-tests/tree-construction/" proc runTests(filename: string) = - let tests = parseTests(readFile(rootpath & filename)) + let factory = newMAtomFactory() + let tests = parseTests(readFile(rootpath & filename), factory) for test in tests: case test.script of SCRIPT_OFF: - test.runTest(scripting = false) + test.runTest(factory, scripting = false) of SCRIPT_ON: - test.runTest(scripting = true) + test.runTest(factory, scripting = true) of SCRIPT_BOTH: - test.runTest(scripting = false) - test.runTest(scripting = true) + test.runTest(factory, scripting = false) + test.runTest(factory, scripting = true) test "tests1.dat": runTests("tests1.dat") diff --git a/tests/tree_charset.nim b/tests/tree_charset.nim index 53ba14d7..c5e8b36b 100644 --- a/tests/tree_charset.nim +++ b/tests/tree_charset.nim @@ -4,7 +4,8 @@ import std/streams import chame/minidom_cs import chakasu/charset -proc runTest(test: TCTest, scripting: bool, labels: openArray[string]) = +proc runTest(test: TCTest, factory: MAtomFactory, scripting: bool, + labels: openArray[string]) = let ss = newStringStream(test.data) let opts = HTML5ParserOpts[Node]( scripting: scripting @@ -32,16 +33,17 @@ proc runTest(test: TCTest, scripting: bool, labels: openArray[string]) = const rootpath = "tests/" proc runTests(filename: string, labels: openArray[string]) = - let tests = parseTests(readFile(rootpath & filename)) + let factory = newMAtomFactory() + let tests = parseTests(readFile(rootpath & filename), factory) for test in tests: case test.script of SCRIPT_OFF: - test.runTest(scripting = false, labels) + test.runTest(factory, scripting = false, labels) of SCRIPT_ON: - test.runTest(scripting = true, labels) + test.runTest(factory, scripting = true, labels) of SCRIPT_BOTH: - test.runTest(scripting = false, labels) - test.runTest(scripting = true, labels) + test.runTest(factory, scripting = false, labels) + test.runTest(factory, scripting = true, labels) test "sjis.dat": runTests("sjis.dat", ["utf8", "sjis", "latin1"]) |