diff options
author | bptato <nincsnevem662@gmail.com> | 2022-06-27 19:00:19 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2022-06-27 23:53:32 +0200 |
commit | 84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56 (patch) | |
tree | 0ad531e10dbee632a3cacc309cc7fae7df7442a6 /src/utils | |
parent | 52c185c83d8e372af7f68fcc4df1ac2f20985e0f (diff) | |
download | chawan-84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56.tar.gz |
Implement standard-compliant HTML tokenizer
Seems performant enough, though I'm not sure whether it is actually producing correct output. Still missing: tests and the actual parsing.
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/radixtree.nim | 4 | ||||
-rw-r--r-- | src/utils/twtstr.nim | 33 |
2 files changed, 25 insertions, 12 deletions
```diff
diff --git a/src/utils/radixtree.nim b/src/utils/radixtree.nim
index 49072d65..f4ef5fb0 100644
--- a/src/utils/radixtree.nim
+++ b/src/utils/radixtree.nim
@@ -139,8 +139,8 @@ proc `[]=`*[T](tree: RadixNode[T], key: string, value: T) =
 func `{}`*[T](node: RadixNode[T], key: string): RadixNode[T] =
   return node.getOrDefault(key, node)
 
-func hasPrefix*[T](tree: RadixNode[T], prefix: string, at: RadixNode[T] = tree): bool =
-  var n = at
+func hasPrefix*[T](node: RadixNode[T], prefix: string): bool =
+  var n = node
   var i = 0
 
   while i < prefix.len:
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index a48f80b2..04db1d24 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -28,8 +28,20 @@ func ansiReset*(str: string): string =
   result &= str
   result &= ansiResetCode
 
+const C0Controls* = {chr(0x00)..chr(0x1F)}
+const Controls* = (C0Controls + {chr(0x7F)})
+const Ascii* = {chr(0x00)..chr(0x7F)}
+const AsciiUpperAlpha* = {'A'..'Z'}
+const AsciiLowerAlpha* = {'a'..'z'}
+const AsciiAlpha* = (AsciiUpperAlpha + AsciiLowerAlpha)
+const AllChars = {chr(0x00)..chr(0xFF)}
+const NonAscii = (AllChars - Ascii)
+const AsciiDigit* = {'0'..'9'}
+const AsciiHexDigit* = (AsciiDigit + {'a'..'f', 'A'..'F'})
+const AsciiWhitespace* = {' ', '\n', '\r', '\t', '\f'}
+
 func isWhitespace*(c: char): bool {.inline.} =
-  return c in {' ', '\n', '\r', '\t', '\f'}
+  return c in AsciiWhitespace
 
 func onlyWhitespace*(s: string): bool =
   for c in s:
@@ -37,15 +49,6 @@ func onlyWhitespace*(s: string): bool =
       return false
   return true
 
-const C0Controls = {chr(0x00)..chr(0x1F)}
-const Controls = (C0Controls + {chr(0x7F)})
-const Ascii* = {chr(0x00)..chr(0x7F)}
-const Letters = {'A'..'Z', 'a'..'z'}
-const AllChars = {chr(0x00)..chr(0xFF)}
-const NonAscii = (AllChars - Ascii)
-const Digits = {'0'..'9'}
-const HexDigits = (Digits + {'a'..'f', 'A'..'F'})
-
 func isControlChar*(c: char): bool =
   return c in Controls
 
@@ -404,6 +407,16 @@ func parseFloat64*(s: string): float64 =
   return float64(sign) *
     (integer + f * pow(10, float64(-d))) * pow(10, (float64(t) * e))
 
+func isSurrogate*(r: Rune): bool = int32(r) in 0xD800..0xDFFF
+func isNonCharacter*(r: Rune): bool =
+  let n = int32(r)
+  n in 0xFDD0..0xFDEF or
+  n in [0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
+        0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF,
+        0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF,
+        0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
+        0x10FFFE, 0x10FFFF]
+
 const ControlPercentEncodeSet* = (Controls + NonAscii)
 const FragmentPercentEncodeSet* = (Controls + NonAscii)
 const QueryPercentEncodeSet* = (ControlPercentEncodeSet + {' ', '"', '#', '<', '>'})
```