examples/htmlrefs.nim


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

# Example program to show the new parsexml module
# This program reads an HTML file and writes all its used links to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

proc `=?=` (a, b: string): bool = 
  # little trick: define our own comparator that ignores case
  return cmpIgnoreCase(a, b) == 0

if paramCount() < 1: 
  quit("Usage: htmlrefs filename[.html]")

var links = 0 # count the number of links
var filename = addFileExt(ParamStr(1), "html")
var s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: TXmlParser
open(x, s, filename)
next(x) # get first event
block mainLoop:
  while true:
    case x.kind
    of xmlElementOpen: 
      # the <a href = "xyz"> tag we are interested in always has an attribute,
      # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
      if x.elementName =?= "a": 
        x.next()
        if x.kind == xmlAttribute: 
          if x.attrKey =?= "href":
            var link = x.attrValue
            inc(links)
            # skip until we have an ``xmlElementClose`` event
            while true: 
              x.next()
              case x.kind
              of xmlEof: break mainLoop
              of xmlElementClose: break
              else: nil
            x.next() # skip ``xmlElementClose``
            # now we have the description for the ``a`` element
            var desc = ""
            while x.kind == xmlCharData: 
              desc.add(x.charData)
              x.next()
            Echo(desc & ": " & link)
      else:
        x.next()      
    of xmlEof: break # end of file reached
    of xmlError: 
      Echo(errorMsg(x))
      x.next()
    else: x.next() # skip other events

echo($links & " link(s) found!")
x.close()