about summary refs log tree commit diff stats
path: root/src/html
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2024-02-12 17:03:35 +0100
committer bptato <nincsnevem662@gmail.com> 2024-02-12 17:03:35 +0100
commit 8e6783a45fba48dd8f63fe7486e4691f05220b52 (patch)
tree 5aae9f9f95432609a497eea858c4a3401dac172b /src/html
parent 69b1a7e7f6e0a675cd70805768162de5621e8279 (diff)
download chawan-8e6783a45fba48dd8f63fe7486e4691f05220b52.tar.gz
Remove CLONE BufferSource; cache document sources in tmpdir
At last all BufferSources are unified.

To achieve the same effect as the previous CLONE source type, we now
use the "fromcache" flag in Request. This *forces* the document to be
streamed from the disk; if the file no longer exists for some reason,
an error is returned (i.e. the document is not re-downloaded).

For a document to be cached, it has to be the main document of the
buffer (i.e. no additional resources requested with fetch()), and
also not an x-htmloutput HTML file (for those, the original source is
saved). The result is that toggleSource now always returns the actual
source for e.g. markdown files, not the HTML-transformed version.

Also, it is now possible to view the source of a document that is
still being downloaded.

buffer.sstream has almost been eliminated; it still exists, but only as
a pseudo-buffer to interface with EncoderStream and DecoderStream. It no
longer holds the entire source of a buffer at any point, and is cleared
as soon as the buffer is completely loaded.
Diffstat (limited to 'src/html')
-rw-r--r-- src/html/chadombuilder.nim 64
1 files changed, 45 insertions, 19 deletions
diff --git a/src/html/chadombuilder.nim b/src/html/chadombuilder.nim
index 044d8643..d604f455 100644
--- a/src/html/chadombuilder.nim
+++ b/src/html/chadombuilder.nim
@@ -29,12 +29,15 @@ type
     seekable: bool
     builder*: ChaDOMBuilder
     opts: HTML5ParserOpts[Node, CAtom]
-    inputStream: Stream
+    stream: StringStream
     encoder: EncoderStream
     decoder: DecoderStream
+    rewindImpl: proc()
     # hack so we don't have to worry about leaks or the GC deallocating parser
     refs: seq[Document]
     stoppedFromScript: bool
+    needsBOMSniff: bool
+    wasICE: bool # inhibitCheckEnd
 
   ChaDOMBuilder = ref object of DOMBuilder[Node, CAtom]
     charset: Charset
@@ -260,17 +263,21 @@ proc parseHTMLFragment*(element: Element, s: string): seq[Node] =
   builder.finish()
   return root.childList
 
-#TODO this should be handled by decoderstream
-proc bomSniff(inputStream: Stream): Charset =
-  let bom = inputStream.readStr(2)
+#TODO this should be handled by decoderstream or buffer
+proc bomSniff(wrapper: HTML5ParserWrapper): Charset =
+  let stream = wrapper.stream
+  let op = stream.getPosition()
+  if op + 2 >= stream.data.len:
+    return CHARSET_UNKNOWN
+  let bom = stream.readStr(2)
   if bom == "\xFE\xFF":
     return CHARSET_UTF_16_BE
   if bom == "\xFF\xFE":
     return CHARSET_UTF_16_LE
   if bom == "\xEF\xBB":
-    if inputStream.readChar() == '\xBF':
+    if op + 3 < stream.data.len and stream.readChar() == '\xBF':
       return CHARSET_UTF_8
-  inputStream.setPosition(0)
+  wrapper.stream.setPosition(op)
   return CHARSET_UNKNOWN
 
 proc switchCharset(wrapper: HTML5ParserWrapper) =
@@ -284,16 +291,18 @@ proc switchCharset(wrapper: HTML5ParserWrapper) =
     DECODER_ERROR_MODE_REPLACEMENT
   else:
     DECODER_ERROR_MODE_FATAL
+  let ice = wrapper.decoder == nil or wrapper.wasICE
   wrapper.parser = initHTML5Parser(builder, wrapper.opts)
-  wrapper.decoder = newDecoderStream(wrapper.inputStream, builder.charset,
+  wrapper.decoder = newDecoderStream(wrapper.stream, builder.charset,
     errormode = em)
-  wrapper.decoder.setInhibitCheckEnd(true)
+  wrapper.decoder.setInhibitCheckEnd(ice)
+  wrapper.wasICE = ice
   wrapper.encoder = newEncoderStream(wrapper.decoder, CHARSET_UTF_8,
     errormode = ENCODER_ERROR_MODE_FATAL)
 
-proc newHTML5ParserWrapper*(inputStream: Stream, window: Window, url: URL,
-    factory: CAtomFactory, charsets: seq[Charset] = @[], seekable = true):
-    HTML5ParserWrapper =
+proc newHTML5ParserWrapper*(stream: StringStream, window: Window, url: URL,
+    factory: CAtomFactory, rewindImpl: proc(), charsets: seq[Charset],
+    seekable: bool): HTML5ParserWrapper =
   let opts = HTML5ParserOpts[Node, CAtom](
     isIframeSrcdoc: false, #TODO?
     scripting: window != nil and window.settings.scripting
@@ -303,14 +312,12 @@ proc newHTML5ParserWrapper*(inputStream: Stream, window: Window, url: URL,
     seekable: seekable,
     builder: builder,
     opts: opts,
-    inputStream: inputStream
+    stream: stream,
+    rewindImpl: rewindImpl,
+    needsBOMSniff: seekable
   )
   builder.document.setActiveParser(wrapper)
-  if seekable and (let scs = inputStream.bomSniff(); scs != CHARSET_UNKNOWN):
-    builder.confidence = ccCertain
-    wrapper.charsetStack = @[scs]
-    wrapper.seekable = false
-  elif charsets.len == 0:
+  if charsets.len == 0:
     wrapper.charsetStack = @[DefaultCharset] # UTF-8
   else:
     for i in countdown(charsets.high, 0):
@@ -385,13 +392,23 @@ proc CDB_parseDocumentWriteChunk(wrapper: pointer) {.exportc.} =
 
 proc parseAll*(wrapper: HTML5ParserWrapper) =
   let builder = wrapper.builder
+  if wrapper.needsBOMSniff:
+    if wrapper.stream.getPosition() + 3 >= wrapper.stream.data.len:
+      return
+    let scs = wrapper.bomSniff()
+    if scs != CHARSET_UNKNOWN:
+      builder.confidence = ccCertain
+      wrapper.charsetStack = @[scs]
+      wrapper.seekable = false
+      wrapper.switchCharset()
+    wrapper.needsBOMSniff = false
   while true:
     let buffer = wrapper.encoder.readAll()
     if wrapper.decoder.failed:
       assert wrapper.seekable
       # Retry with another charset.
       builder.restart(wrapper)
-      wrapper.inputStream.setPosition(0)
+      wrapper.rewindImpl()
       wrapper.switchCharset()
       continue
     if buffer.len == 0:
@@ -402,13 +419,22 @@ proc parseAll*(wrapper: HTML5ParserWrapper) =
     # res == PRES_STOP: A meta tag describing the charset has been found; force
     # use of this charset.
     builder.restart(wrapper)
-    wrapper.inputStream.setPosition(0)
+    wrapper.rewindImpl()
     wrapper.charsetStack.add(builder.charset)
     wrapper.seekable = false
     wrapper.switchCharset()
 
 proc finish*(wrapper: HTML5ParserWrapper) =
+  if wrapper.needsBOMSniff:
+    let scs = wrapper.bomSniff()
+    if scs != CHARSET_UNKNOWN:
+      wrapper.builder.confidence = ccCertain
+      wrapper.charsetStack = @[scs]
+      wrapper.seekable = false
+      wrapper.switchCharset()
+    wrapper.needsBOMSniff = false
   wrapper.decoder.setInhibitCheckEnd(false)
+  wrapper.wasICE = false
   wrapper.parseAll()
   wrapper.parser.finish()
   wrapper.builder.finish()