summary refs log tree commit diff stats
path: root/lib/base/regexprs.nim
blob: e2aac3d17ab8486181120687a14590eb940d849b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Regular expression support for Nimrod.
## Currently this module is implemented by providing a wrapper around the
## `PRCE (Perl-Compatible Regular Expressions) <http://www.pcre.org>`_
## C library. This means that your application will depend on the PRCE
## library's licence when using this module, which should not be a problem
## though.
## PRCE's licence follows:
##
## .. include:: ../doc/regexprs.txt
##

# This is not just a convenient wrapper for the pcre library; the
# API will stay the same if the implementation should change.

import
  pcre, strutils

type
  EInvalidRegEx* = object of EInvalidValue
    ## is raised if the pattern is no valid regular expression.

const
  MaxSubpatterns* = 10
    ## defines the maximum number of subpatterns that can be captured.
    ## More subpatterns cannot be captured!

proc match*(s, pattern: string, matches: var openarray[string],
            start: int = 0): bool
  ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
  ## the captured substrings in the array ``matches``. If it does not
  ## match, nothing is written into ``matches`` and ``false`` is
  ## returned.

proc match*(s, pattern: string, start: int = 0): bool
  ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.

proc matchLen*(s, pattern: string, matches: var openarray[string],
               start: int = 0): int
  ## the same as ``match``, but it returns the length of the match,
  ## if there is no match, -1 is returned. Note that a match length
  ## of zero can happen.

proc find*(s, pattern: string, matches: var openarray[string],
           start: int = 0): bool
  ## returns ``true`` if ``pattern`` occurs in ``s`` and the captured
  ## substrings in the array ``matches``. If it does not match, nothing
  ## is written into ``matches``.
proc find*(s, pattern: string, start: int = 0): bool
  ## returns ``true`` if ``pattern`` occurs in ``s``.


proc rawCompile(pattern: string, flags: cint): PPcre =
  var
    msg: CString
    offset: int
    com = pcreCompile(pattern, flags, addr(msg), addr(offset), nil)
  if com == nil:
    var e: ref EInvalidRegEx
    new(e)
    e.msg = $msg & "\n" & pattern & "\n" & repeatChar(offset) & "^\n"
    raise e
  return com

proc matchOrFind(s: string, pattern: PPcre, matches: var openarray[string],
                 start: cint): cint =
  var
    rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
    res = int(pcreExec(pattern, nil, s, len(s), start, 0,
      cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3))
  dealloc(pattern)
  if res < 0: return res
  for i in 0..res-1:
    var
      a = rawMatches[i * 2]
      b = rawMatches[i * 2 + 1]
    if a >= 0'i32: matches[i] = copy(s, a, int(b)-1)
    else: matches[i] = ""
  return res

proc matchOrFind(s: string, pattern: PPcre, start: cint): cint =
  var
    rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
    res = pcreExec(pattern, nil, s, len(s), start, 0,
                   cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
  dealloc(pattern)
  return res

proc match(s, pattern: string, matches: var openarray[string],
           start: int = 0): bool =
  return matchOrFind(s, rawCompile(pattern, PCRE_ANCHORED),
                     matches, start) >= 0'i32

proc matchLen(s, pattern: string, matches: var openarray[string],
              start: int = 0): int =
  return matchOrFind(s, rawCompile(pattern, PCRE_ANCHORED), matches, start)

proc find(s, pattern: string, matches: var openarray[string],
          start: int = 0): bool =
  return matchOrFind(s, rawCompile(pattern, PCRE_MULTILINE),
                     matches, start) >= 0'i32

proc match(s, pattern: string, start: int = 0): bool =
  return matchOrFind(s, rawCompile(pattern, PCRE_ANCHORED), start) >= 0'i32

proc find(s, pattern: string, start: int = 0): bool =
  return matchOrFind(s, rawCompile(pattern, PCRE_MULTILINE), start) >= 0'i32

template `=~` *(s, pattern: expr): expr = 
  ## This calls ``match`` with an implicit declared ``matches`` array that 
  ## can be used in the scope of the ``=~`` call: 
  ## 
  ## .. code-block:: nimrod
  ##
  ##   if line =~ r"\s*(\w+)\s*\=\s*(\w+)": 
  ##     # matches a key=value pair:
  ##     echo("Key: ", matches[1])
  ##     echo("Value: ", matches[2])
  ##   elif line =~ r"\s*(\#.*)":
  ##     # matches a comment
  ##     # note that the implicit ``matches`` array is different from the
  ##     # ``matches`` array of the first branch
  ##     echo("comment: ", matches[1])
  ##   else:
  ##     echo("syntax error")
  ## 
  var matches: array[0..maxSubPatterns-1, string]
  match(s, pattern, matches)
  

const ## common regular expressions
  reIdentifier* = r"\b[a-zA-Z_]+[a-zA-Z_0-9]*\b"  ## describes an identifier
  reNatural* = r"\b\d+\b" ## describes a natural number
  reInteger* = r"\b[-+]?\d+\b" ## describes an integer
  reHex* = r"\b0[xX][0-9a-fA-F]+\b" ## describes a hexadecimal number
  reBinary* = r"\b0[bB][01]+\b" ## describes a binary number (example: 0b11101)
  reOctal* = r"\b0[oO][0-7]+\b" ## describes an octal number (example: 0o777)
  reFloat* = r"\b[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\b"
    ## describes a floating point number
  reEmail* = r"\b[a-zA-Z0-9!#$%&'*+/=?^_`{|}~\-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)" &
             r"*@(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+(?:[a-zA-Z]{2}|com|org|" &
             r"net|gov|mil|biz|info|mobi|name|aero|jobs|museum)\b"
    ## describes a common email address
  reURL* = r"\b(http(s)?|ftp|gopher|telnet|file|notes|ms\-help):" &
           r"((//)|(\\\\))+[\w\d:#@%/;$()~_?\+\-\=\\\.\&]*\b"
    ## describes an URL