discard """
output: '''Match failed: spam
Match failed: ham'''
joinable: false
"""
# bug #6220
import nre
import options
import strutils except isAlpha, isLower, isUpper, isSpace
from unicode import isAlpha, isLower, isUpper, isTitle, isWhiteSpace
import os
const debugLex = false
template debug(enable: bool, text: string): typed =
when enable:
echo(text)
type
Parser[N, T] = proc(text: T, start: int, nodes: var seq[Node[N]]): int {.closure.}
RuleObj[N, T] = object
parser: Parser[N, T]
kind: N
Rule[N, T] = ref RuleObj[N, T]
NodeKind = enum
terminal,
nonterminal
Node*[N] = object of RootObj
# Uncomment the following lines and the compiler crashes
# case nodeKind: NodeKind
# of nonterminal:
# kids: Node[N]
# of terminal:
# discard
start*: int
length*: int
kind*: N
NonTerminal[N] = object of Node
children: seq[Node[N]]
proc newRule[N, T](parser: Parser, kind: N): Rule[N, T] =
new(result)
result.parser = parser
result.kind = kind
proc newRule[N, T](kind: N): Rule[N, T] =
new(result)
result.kind = kind
proc initNode[N](start: int, length: int, kind: N): Node[N] =
result.start = start
result.length = length
result.kind = kind
proc initNode[N](start: int, length: int, children: seq[Node[N]], kind: N): NonTerminal[N] =
result.start = start
result.length = length
result.kind = kind
result.children = children
proc substr[T](text: T, first, last: int): T =
text[first .. last]
proc continuesWith[N](text: seq[Node[N]], subtext: seq[N], start: Natural): bool =
let length = len(text)
var pos = 0
while pos < len(subtext):
let textpos = start + pos
if textpos == len(text):
return false
if text[textpos].kind != subtext[pos].kind:
return false
pos+=1
return true
proc render*[N, T](text: T, nodes: seq[Node[N]]): string =
## Uses a sequence of Nodes to render a given text string
result = ""
for node in nodes:
result.add("<" & node.value(text) & ">")
proc render*[N, T](rule: Rule[N, T], text: string): string =
## Uses a rule to render a given text string
render(text, rule.parse(text))
proc render*[N, T](text: T, nodes: seq[Node[N]], source: string): string =
result = ""
for node in nodes:
result.add("[" & node.value(text, source) & "]")
proc render*[N, T, X](rule: Rule[N, T], text: seq[Node[X]], source: string): string =
## Uses a rule to render a given series of nodes, providing the source string
text.render(rule.parse(text, source = source), source)
proc annotate*[N, T](node: Node[N], text: T): string =
result = "<" & node.value(text) & ":" & $node.kind & ">"
proc annotate*[N, T](nodes: seq[Node[N]], text: T): string =
result = ""
for node in nodes:
result.add(node.annotate(text))
proc annotate*[N, T](rule: Rule[N, T], text: T): string =
annotate(rule.parse(text), text)
proc value*[N, T](node: Node[N], text: T): string =
result = $text.substr(node.start, node.start + node.length - 1)
proc value*[N, X](node: Node[N], text: seq[Node[X]], source: string): string =
result = ""
for n in node.start ..< node.start + node.length:
result &= text[n].annotate(source)
proc parse*[N, T](rule: Rule[N, T], text: T, start = 0, source: string = ""): seq[Node[N]] =
result = newSeq[Node[N]]()
debug(debugLex, "Parsing: " & $text)
let length = rule.parser(text, start, result)
when T is string:
if length == -1:
echo("Match failed: " & $text)
result = @[]
elif length == len(text):
debug(debugLex, "Matched: " & $text & " => " & $len(result) & " tokens: " & text.render(result))
else:
echo("Matched first " & $length & " symbols: " & $text & " => " & $len(result) & " tokens: " & text.render(result))
else:
if length == -1:
echo("Match failed: " & $text)
result = @[]
elif length == len(text):
debug(debugLex, "Matched: " & $text & " => " & $len(result) & " tokens: " & text.render(result, source))
else:
echo("Matched first " & $length & " symbols: " & $text & " => " & $len(result) & " tokens: " & text.render(result, source))
proc literal*[N, T, P](pattern: P, kind: N): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
if start == len(text):
return -1
assert(len(text)>start, "Attempting to match at $#, string length is $# " % [$start, $len(text)])
when P is string or P is seq[N]:
debug(debugLex, "Literal[" & $kind & "]: testing " & $pattern & " at " & $start & ": " & $text[start..start+len(pattern)-1])
if text.continuesWith(pattern, start):
let node = initNode(start, len(pattern), kind)
nodes.add(node)
debug(debugLex, "Literal: matched <" & $text[start ..< start+node.length] & ":" & $node.length & ">" )
return node.length
elif P is char:
debug(debugLex, "Literal[" & $kind & "]: testing " & $pattern & " at " & $start & ": " & $text[start])
if text[start] == pattern:
let node = initNode(start, 1, kind)
nodes.add(node)
return 1
else:
debug(debugLex, "Literal[" & $kind & "]: testing " & $pattern & " at " & $start & ": " & $text[start])
if text[start].kind == pattern:
let node = initNode(start, 1, kind)
nodes.add(node)
return 1
return -1
result = newRule[N, T](parser, kind)
proc token[N, T](pattern: T, kind: N): Rule[N, T] =
when T is not string:
{.fatal: "Token is only supported for strings".}
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
debug(debugLex, "Token[" & $kind & "]: testing " & pattern & " at " & $start)
if start == len(text):
return -1
assert(len(text)>start, "Attempting to match at $#, string length is $# " % [$start, $len(text)])
let m = text.match(re(pattern), start)
if m.isSome:
let node = initNode(start, len(m.get.match), kind)
nodes.add(node)
result = node.length
debug(debugLex, "Token: matched <" & text[start ..< start+node.length] & ":" & $node.length & ">" )
else:
result = -1
result = newRule[N, T](parser, kind)
proc chartest[N, T, S](testfunc: proc(s: S): bool, kind: N): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
if start == len(text):
return -1
assert(len(text)>start, "Attempting to match at $#, string length is $# " % [$start, $len(text)])
if testfunc(text[start]):
nodes.add(initNode(start, 1, kind))
result = 1
else:
result = -1
result = newRule[N, T](parser, kind)
proc any*[N, T, S](symbols: T, kind: N): Rule[N, T] =
let test = proc(s: S): bool =
when S is string:
debug(debugLex, "Any[" & $kind & "]: testing for " & symbols.replace("\n", "\\n").replace("\r", "\\r"))
else:
debug(debugLex, "Any[" & $kind & "]: testing for " & $symbols)
result = s in symbols
result = chartest[N, T, S](test, kind)
proc ignore*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
result = rule.parser(text, start, mynodes)
result = newRule[N, T](parser, rule.kind)
proc combine*[N, T](rule: Rule[N, T], kind: N): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
result = rule.parser(text, start, mynodes)
nodes.add(initNode(start, result, kind))
result = newRule[N, T](parser, kind)
proc build*[N, T](rule: Rule[N, T], kind: N): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
result = rule.parser(text, start, mynodes)
let nonTerminal = initNode(start, result, mynodes, kind)
nodes.add(nonTerminal)
result = newRule[N, T](parser, kind)
proc fail*[N, T](message: string, kind: N): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
let lineno = countLines(text[0..start])
var startline = start
var endline = start
while startline>0:
if text[startline] in NewLines:
break
startline-=1
while endline < len(text):
if text[endline] in NewLines:
break
endline+=1
let charno = start-startline
echo text.substr(startline, endline)
echo ' '.repeat(max(charno,0)) & '^'
raise newException(ValueError, "Position: " & $start & " Line: " & $lineno & ", Symbol: " & $charno & ": " & message)
result = newRule[N, T](parser, kind)
proc `+`*[N, T](left: Rule[N, T], right: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
assert(not isNil(left.parser), "Left hand side parser is nil")
let leftlength = left.parser(text, start, mynodes)
if leftlength == -1:
return leftlength
assert(not isNil(right.parser), "Right hand side parser is nil")
let rightlength = right.parser(text, start+leftlength, mynodes)
if rightlength == -1:
return rightlength
result = leftlength + rightlength
nodes.add(mynodes)
result = newRule[N, T](parser, left.kind)
proc `/`*[N, T](left: Rule[N, T], right: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
assert(not isNil(left.parser), "Left hand side of / is not fully defined")
let leftlength = left.parser(text, start, mynodes)
if leftlength != -1:
nodes.add(mynodes)
return leftlength
mynodes = newSeq[Node[N]]()
assert(not isNil(right.parser), "Right hand side of / is not fully defined")
let rightlength = right.parser(text, start, mynodes)
if rightlength == -1:
return rightlength
nodes.add(mynodes)
return rightlength
result = newRule[N, T](parser, left.kind)
proc `?`*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
let success = rule.parser(text, start, nodes)
return if success != -1: success else: 0
result = newRule[N, T](parser, rule.kind)
proc `+`*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var success = rule.parser(text, start, nodes)
if success == -1:
return success
var total = 0
while success != -1 and start+total < len(text):
total += success
success = rule.parser(text, start+total, nodes)
return total
result = newRule[N, T](parser, rule.kind)
proc `*`*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
let success = (+rule).parser(text, start, nodes)
return if success != -1: success else: 0
result = newRule[N, T](parser, rule.kind)
#Note: this consumes - for zero-width lookahead see !
proc `^`*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
let success = rule.parser(text, start, mynodes)
return if success == -1: 1 else: -1
result = newRule[N, T](parser, rule.kind)
proc `*`*[N, T](repetitions: int, rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
var total = 0
for i in 0..<repetitions:
let success = rule.parser(text, start+total, mynodes)
if success == -1:
return success
else:
total += success
nodes.add(mynodes)
return total
result = newRule[N, T](parser, rule.kind)
# Positive zero-width lookahead
proc `&`*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
let success = rule.parser(text, start, mynodes)
return if success != -1: 0 else: -1
result = newRule[N, T](parser, rule.kind)
# Negative zero-width lookahead
proc `!`*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
let failure = rule.parser(text, start, mynodes)
return if failure == -1: 0 else: -1
result = newRule[N, T](parser, rule.kind)
proc `/`*[N, T](rule: Rule[N, T]): Rule[N, T] =
let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
var mynodes = newSeq[Node[N]]()
var length = 0
var success = rule.parser(text, start+length, mynodes)
while success == -1 and start+length < len(text):
length += 1
success = rule.parser(text, start+length, mynodes)
if start+length >= len(text):
result = -1
else:
nodes.add(initNode(start, length, rule.kind))
nodes.add(mynodes)
result = length + success
result = newRule[N, T](parser, rule.kind)
proc `->`*(rule: Rule, production: Rule) =
assert(not isnil(production.parser), "Right hand side of -> is nil - has the rule been defined yet?")
rule.parser = production.parser
template grammar*[K](Kind, Text, Symbol: typedesc; default: K, code: untyped): typed {.hint[XDeclaredButNotUsed]: off.} =
proc newRule(): Rule[Kind, Text] {.inject.} = newRule[Kind, Text](default)
proc chartest(testfunc: proc(c: Symbol): bool): Rule[Kind, Text] {.inject.} = chartest[Kind, Text, Symbol](testfunc, default)
proc literal[P](pattern: P, kind: K): Rule[Kind, Text] {.inject.} = literal[Kind, Text, P](pattern, kind)
proc literal[P](pattern: P): Rule[Kind, Text] {.inject.} = literal[Kind, Text, P](pattern, default)
when Text is string:
proc token(pattern: string): Rule[Kind, Text] {.inject.} = token(pattern, default)
proc fail(message: string): Rule[Kind, Text] {.inject.} = fail[Kind, Text](message, default)
let alpha {.inject.} = chartest[Kind, Text, Symbol](isAlphaAscii, default)
let alphanumeric {.inject.}= chartest[Kind, Text, Symbol](isAlphaNumeric, default)
let digit {.inject.} = chartest[Kind, Text, Symbol](isDigit, default)
let lower {.inject.} = chartest[Kind, Text, Symbol](isLowerAscii, default)
let upper {.inject.} = chartest[Kind, Text, Symbol](isUpperAscii, default)
let isspace = proc (x: char): bool = x.isSpaceAscii and not (x in NewLines)
let space {.inject.} = chartest[Kind, Text, Symbol](isspace, default)
let isnewline = proc (x: char): bool = x in NewLines
let newline {.inject.} = chartest[Kind, Text, Symbol](isnewline, default)
let alphas {.inject.} = combine(+alpha, default)
let alphanumerics {.inject.} = combine(+alphanumeric, default)
let digits {.inject.} = combine(+digit, default)
let lowers {.inject.} = combine(+lower, default)
let uppers {.inject.} = combine(+upper, default)
let spaces {.inject.} = combine(+space, default)
let newlines {.inject.} = combine(+newline, default)
proc any(chars: Text): Rule[Kind, Text] {.inject.} = any[Kind, Text, Symbol](chars, default)
proc combine(rule: Rule[Kind, Text]): Rule[Kind, Text] {.inject.} = combine[Kind, Text](rule, default)
code
template grammar*[K](Kind: typedesc; default: K, code: untyped): typed {.hint[XDeclaredButNotUsed]: off.} =
grammar(Kind, string, char, default, code)
block:
type DummyKind = enum dkDefault
grammar(DummyKind, string, char, dkDefault):
let rule = token("h[a]+m") + ignore(token(r"\s+")) + (literal("eggs") / literal("beans"))
var text = "ham beans"
discard rule.parse(text)
var recursive = newRule()
recursive -> (literal("(") + recursive + literal(")")) / token(r"\d+")
for test in ["spam", "57", "(25)", "((25))"]:
discard recursive.parse(test)
let repeated = +literal("spam") + ?literal("ham") + *literal("salami")
for test in ["ham", "spam", "spamspamspam" , "spamham", "spamsalami", "spamsalamisalami"]:
discard repeated.parse(test)