Implement standard-compliant HTML tokenizer

Seems performant enough, though I'm not sure whether the tokenizer is actually producing correct output. Still missing: tests and the actual parsing.
author: bptato <nincsnevem662@gmail.com> 2022-06-27 19:00:19 +0200
committer: bptato <nincsnevem662@gmail.com> 2022-06-27 23:53:32 +0200
commit: 84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56 (patch)
tree: 0ad531e10dbee632a3cacc309cc7fae7df7442a6 /src/utils
parent: 52c185c83d8e372af7f68fcc4df1ac2f20985e0f (diff)
download: chawan-84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56.tar.gz
2 files changed, 25 insertions, 12 deletions
diff --git a/src/utils/radixtree.nim b/src/utils/radixtree.nim
index 49072d65..f4ef5fb0 100644
--- a/src/utils/radixtree.nim
+++ b/src/utils/radixtree.nim
@@ -139,8 +139,8 @@ proc `[]=`*[T](tree: RadixNode[T], key: string, value: T) =
 func `{}`*[T](node: RadixNode[T], key: string): RadixNode[T] =
   return node.getOrDefault(key, node)
 
-func hasPrefix*[T](tree: RadixNode[T], prefix: string, at: RadixNode[T] = tree): bool =
-  var n = at
+func hasPrefix*[T](node: RadixNode[T], prefix: string): bool =
+  var n = node
   var i = 0
 
   while i < prefix.len:
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index a48f80b2..04db1d24 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -28,8 +28,20 @@ func ansiReset*(str: string): string =
   result &= str
   result &= ansiResetCode
 
+const C0Controls* = {chr(0x00)..chr(0x1F)}
+const Controls* = (C0Controls + {chr(0x7F)})
+const Ascii* = {chr(0x00)..chr(0x7F)}
+const AsciiUpperAlpha* = {'A'..'Z'}
+const AsciiLowerAlpha* = {'a'..'z'}
+const AsciiAlpha* = (AsciiUpperAlpha + AsciiLowerAlpha)
+const AllChars = {chr(0x00)..chr(0xFF)}
+const NonAscii = (AllChars - Ascii)
+const AsciiDigit* = {'0'..'9'}
+const AsciiHexDigit* = (AsciiDigit + {'a'..'f', 'A'..'F'})
+const AsciiWhitespace* = {' ', '\n', '\r', '\t', '\f'}
+
 func isWhitespace*(c: char): bool {.inline.} =
-  return c in {' ', '\n', '\r', '\t', '\f'}
+  return c in AsciiWhitespace
 
 func onlyWhitespace*(s: string): bool =
   for c in s:
@@ -37,15 +49,6 @@ func onlyWhitespace*(s: string): bool =
       return false
   return true
 
-const C0Controls = {chr(0x00)..chr(0x1F)}
-const Controls = (C0Controls + {chr(0x7F)})
-const Ascii* = {chr(0x00)..chr(0x7F)}
-const Letters = {'A'..'Z', 'a'..'z'}
-const AllChars = {chr(0x00)..chr(0xFF)}
-const NonAscii = (AllChars - Ascii)
-const Digits = {'0'..'9'}
-const HexDigits = (Digits + {'a'..'f', 'A'..'F'})
-
 func isControlChar*(c: char): bool =
   return c in Controls
 
@@ -404,6 +407,16 @@ func parseFloat64*(s: string): float64 =
 
   return float64(sign) * (integer + f * pow(10, float64(-d))) * pow(10, (float64(t) * e))
 
+func isSurrogate*(r: Rune): bool = int32(r) in 0xD800..0xDFFF
+func isNonCharacter*(r: Rune): bool =
+  let n = int32(r)
+  n in 0xFDD0..0xFDEF or
+  n in [0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
+        0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF,
+        0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF,
+        0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
+        0x10FFFE, 0x10FFFF]
+
 const ControlPercentEncodeSet* = (Controls + NonAscii)
 const FragmentPercentEncodeSet* = (Controls + NonAscii)
 const QueryPercentEncodeSet* = (ControlPercentEncodeSet + {' ', '"', '#', '<', '>'})
author	bptato <nincsnevem662@gmail.com>	2022-06-27 19:00:19 +0200
committer	bptato <nincsnevem662@gmail.com>	2022-06-27 23:53:32 +0200
commit	84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56 (patch)
tree	0ad531e10dbee632a3cacc309cc7fae7df7442a6 /src/utils
parent	52c185c83d8e372af7f68fcc4df1ac2f20985e0f (diff)
download	chawan-84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56.tar.gz