## A demonstration of using the Chagashi encoding library in combination
## with the Chame HTML parser.
##
## For the most part, this is the same as minidom, except it supports
## decoding documents with arbitrary character sets.
##
## Note: this is not implemented for the fragment parsing algorithm,
## because that is only defined for UTF-8 in the standard.
##
## For a version without the encoding library dependency, see
## [minidom](minidom.html).
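##
## A minimal usage sketch (the module path and file name below are
## assumptions, and default-constructed parser options are used):
##
## ```nim
## import std/streams
## import chame/minidom_cs
##
## let doc = parseHTML(newFileStream("example.html"),
##   HTML5ParserOpts[Node, MAtom](), charsets = @[CHARSET_UTF_8])
## ```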
import std/streams
import minidom
import htmlparser
import tags
import chagashi/charset
import chagashi/decoder
export minidom
export tags
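# The confidence concept is taken from the standard's encoding sniffing
# algorithm: a tentative charset may still be replaced (e.g. by a charset
# declared in a meta tag), a certain one may not.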
type CharsetConfidence = enum
ccTentative, ccCertain
type CharsetMiniDOMBuilder = ref object of MiniDOMBuilder
charset: Charset
confidence: CharsetConfidence
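# Called by the parser when it encounters an in-document charset declaration
# (e.g. a <meta charset=...> tag). This roughly mirrors the standard's
# "changing the encoding while parsing" steps: unknown labels are ignored, a
# UTF-16 charset (e.g. one detected from a BOM) is never overridden,
# x-user-defined is mapped to windows-1252, and any other differing charset
# requests a restart by returning SET_ENCODING_STOP.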
method setEncodingImpl(builder: CharsetMiniDOMBuilder, encoding: string):
SetEncodingResult =
let charset = getCharset(encoding)
if charset == CHARSET_UNKNOWN:
return SET_ENCODING_CONTINUE
if builder.charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}:
builder.confidence = ccCertain
return SET_ENCODING_CONTINUE
builder.confidence = ccCertain
if charset == builder.charset:
return SET_ENCODING_CONTINUE
if charset == CHARSET_X_USER_DEFINED:
builder.charset = CHARSET_WINDOWS_1252
else:
builder.charset = charset
return SET_ENCODING_STOP
proc newCharsetMiniDOMBuilder(factory: MAtomFactory): CharsetMiniDOMBuilder =
let document = Document(factory: factory)
let builder = CharsetMiniDOMBuilder(document: document, factory: factory)
return builder
#TODO this should be handled by decoderstream
proc bomSniff(inputStream: Stream): Charset =
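  ## Look for a UTF-16 or UTF-8 byte order mark at the start of
  ## `inputStream`. On a match, the BOM is consumed and its charset is
  ## returned; otherwise the stream is rewound and CHARSET_UNKNOWN is
  ## returned.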
let bom = inputStream.readStr(2)
if bom == "\xFE\xFF":
return CHARSET_UTF_16_BE
if bom == "\xFF\xFE":
return CHARSET_UTF_16_LE
if bom == "\xEF\xBB":
if inputStream.readChar() == '\xBF':
return CHARSET_UTF_8
inputStream.setPosition(0)
return CHARSET_UNKNOWN
proc parseHTML*(inputStream: Stream, opts: HTML5ParserOpts[Node, MAtom],
charsets: seq[Charset], seekable = true,
factory = newMAtomFactory()): Document =
## Read, parse and return an HTML document from `inputStream`.
##
## `charsets` is a list of input character sets to try. If empty, it will be
## initialized to `@[CHARSET_UTF_8]`.
##
## The list of fallback charsets is used as follows:
##
## * A charset stack is initialized to `charsets`, reversed. This
## means that the first charset specified in `charsets` is on top of
## the stack. (e.g. say `charsets = @[CHARSET_UTF_16_LE, CHARSET_UTF_8]`,
## then utf-16-le is tried before utf-8.)
## * BOM sniffing is attempted. If successful, confidence is set to
## certain and the resulting charset is used (i.e. other character
## sets will not be tried for decoding this document.)
## * If the charset stack is empty, UTF-8 is pushed on top.
## * Attempt to parse the document with the first charset on top of
## the stack.
  ## * If BOM sniffing was unsuccessful, and a `<meta charset=...>` tag
  ##   declaring a different charset is encountered, parsing is restarted
  ##   with the specified charset.
## No further attempts are made to detect the encoding, and decoder
## errors are signaled by U+FFFD replacement characters.
## * Otherwise, each charset on the charset stack is tried until either no
## decoding errors are encountered, or only one charset is left. For
## the last charset, decoder errors are signaled by U+FFFD replacement
## characters.
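  ##
  ## For example, the following sketch (the file name is hypothetical) tries
  ## UTF-8 first, and falls back to windows-1252 if UTF-8 decoding fails:
  ##
  ## ```nim
  ## let doc = parseHTML(newFileStream("page.html"),
  ##   HTML5ParserOpts[Node, MAtom](),
  ##   charsets = @[CHARSET_UTF_8, CHARSET_WINDOWS_1252])
  ## ```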
##
## `seekable` must be true only if `inputStream` is seekable; if set to true,
## `inputStream.setPosition(0)` must work.
##
## Note that `seekable = false` disables automatic character set detection;
  ## even a `<meta charset=...>` tag will not cause a re-parse with the
  ## charset it declares.
  let builder = newCharsetMiniDOMBuilder(factory)
  var charsetStack: seq[Charset] = @[]
  for i in countdown(charsets.high, 0):
    charsetStack.add(charsets[i])
  var seekable = seekable
  if seekable:
    let scs = inputStream.bomSniff()
    if scs != CHARSET_UNKNOWN:
      charsetStack.add(scs)
      seekable = false
  if charsetStack.len == 0:
    charsetStack.add(CHARSET_UTF_8)
  while true:
    builder.charset = charsetStack.pop()
    if seekable and charsetStack.len > 0:
builder.confidence = ccTentative # used in the next iteration
else:
builder.confidence = ccCertain
var parser = initHTML5Parser(builder, opts)
var iq {.noinit.}: array[4096, char]
let decoder = newTextDecoder(builder.charset)
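    # While the charset is tentative, decoding errors abort the decode
    # (demFatal) so that the next charset on the stack can be tried; once it
    # is certain, errors are replaced with U+FFFD (demReplacement).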
let errorMode = [
ccTentative: demFatal,
ccCertain: demReplacement
][builder.confidence]
var ctx = initTextDecoderContext(decoder, errorMode)
while true:
let n = inputStream.readData(addr iq[0], iq.len)
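      # A short read is treated as end of input.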
var finish = n < iq.len
for chunk in ctx.decode(iq.toOpenArrayByte(0, n - 1), finish = finish):
# res can be PRES_SCRIPT, PRES_STOP or PRES_CONTINUE.
var res = parser.parseChunk(chunk.toOpenArray())
# For PRES_SCRIPT, we must re-feed the same chunk as in minidom, but
# starting from the current insertion point.
var ip = 0
while res == PRES_SCRIPT and
(ip += parser.getInsertionPoint(); ip != chunk.len):
res = parser.parseChunk(chunk.toOpenArray(ip, chunk.high))
# PRES_STOP is returned when we return SET_ENCODING_STOP from
# setEncodingImpl. We immediately stop parsing in this case.
if res == PRES_STOP:
finish = true
break
if finish:
break
parser.finish()
if builder.confidence == ccCertain and seekable:
# A meta tag describing the charset has been found; force use of this
# charset.
inputStream.setPosition(0)
builder.document = Document(factory: factory)
charsetStack.add(builder.charset)
seekable = false
continue
if ctx.failed and seekable:
# Retry with another charset.
inputStream.setPosition(0)
builder.document = Document(factory: factory)
continue
break
return builder.document