about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorbptato <nincsnevem662@gmail.com>2023-07-25 22:10:28 +0200
committerbptato <nincsnevem662@gmail.com>2023-07-25 22:10:28 +0200
commitdcec174b2af3af816180772f948575f896d58d20 (patch)
tree72006d4d310f7f7411b9e684567786e9b634f902
parent5153b064d59b627cd2ea061bf88078cbfddfafa6 (diff)
downloadchawan-dcec174b2af3af816180772f948575f896d58d20.tar.gz
Add compileMatchRegex
This makes it so that host = 'example\.org' mandates an exact match,
but host = '^example' matches example.org, example.com, etc.
(Previously, 'example\.org' would have matched exampleexample.org
as well, which was quite counter-intuitive.)
-rw-r--r--doc/config.md42
-rw-r--r--src/config/config.nim8
-rw-r--r--src/js/regex.nim35
3 files changed, 73 insertions, 12 deletions
diff --git a/doc/config.md b/doc/config.md
index 443d7003..ccaa64ab 100644
--- a/doc/config.md
+++ b/doc/config.md
@@ -35,6 +35,8 @@ examples.
 * [Keybindings](#keybindings)
    * [Pager actions](#pager-actions)
    * [Line-editing actions](#line-editing-actions)
+* [Appendix](#appendix)
+   * [Regex handling](#regex-handling)
 
 ## Start
 
@@ -314,7 +316,9 @@ Omnirule options:
 <td>match</td>
 <td>regex</td>
 <td>Regular expression used to match the input string. Note that websites
-passed as arguments are matched as well.</td>
+passed as arguments are matched as well.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
@@ -335,12 +339,12 @@ Examples:
 ```
 # Enable cookies on the orange website for log-in.
 [[siteconf]]
-url = '^https://news\.ycombinator\.com/.*'
+url = 'https://news\.ycombinator\.com/.*'
 cookie = true
 
 # Redirect npr.org to text.npr.org.
 [[siteconf]]
-host = '^(www\.)?npr\.org$'
+host = '(www\.)?npr\.org'
 rewrite-url = '''
 (x) => {
 	x.host = "text.npr.org";
@@ -351,10 +355,10 @@ rewrite-url = '''
 
 # Allow cookie sharing on *sr.ht domains.
 [[siteconf]]
-host = '^.*sr\.ht$'
+host = '.*sr\.ht'
 cookie = true
 share-cookie-jar = 'sr.ht'
-third-party-cookie = '^.*\.sr.ht$'
+third-party-cookie = '.*\.sr\.ht'
 ```
 
 Siteconf options:
@@ -371,14 +375,18 @@ Siteconf options:
 <td>url</td>
 <td>regex</td>
 <td>Regular expression used to match the URL. Either this or the `host` option
-must be specified.</td>
+must be specified.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
 <td>host</td>
 <td>regex</td>
 <td>Regular expression used to match the host part of the URL (i.e. domain
-name/ip address.) Either this or the `url` option must be specified.</td>
+name/ip address.) Either this or the `url` option must be specified.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
@@ -399,7 +407,9 @@ false for all websites.</td>
 <td>third-party-cookie</td>
 <td>regex/array of regexes</td>
 <td>Domains for which third-party cookies are allowed on this domain. Note:
-this only works for buffers which share the same cookie jar.</td>
+this only works for buffers which share the same cookie jar.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
@@ -876,3 +886,19 @@ as a word boundary.
 # Control+W deletes everything before the cursor until it reaches a space. 
 'C-w' = 'line.clearWord(x => x == " ")'
 ```
+
+## Appendix
+
+### Regex handling
+
+Regular expressions are assumed to be exact matches, except when they start
+with a caret (^) sign or end with an unescaped dollar ($) sign.
+
+In other words, the following transformations occur:
+
+```
+^abcd -> ^abcd
+efgh$ -> efgh$
+^ijkl$ -> ^ijkl$
+mnop -> ^mnop$
+```
diff --git a/src/config/config.nim b/src/config/config.nim
index b546ce3c..dfcb63c4 100644
--- a/src/config/config.nim
+++ b/src/config/config.nim
@@ -183,11 +183,11 @@ proc getSiteConfig*(config: Config, jsctx: JSContext): seq[SiteConfig] =
       images: sc.images
     )
     if sc.url.isSome:
-      conf.url = opt(compileRegex(sc.url.get, 0))
+      conf.url = opt(compileMatchRegex(sc.url.get))
     elif sc.host.isSome:
-      conf.host = opt(compileRegex(sc.host.get, 0))
+      conf.host = opt(compileMatchRegex(sc.host.get))
     for rule in sc.third_party_cookie:
-      conf.third_party_cookie.add(compileRegex(rule, 0).get)
+      conf.third_party_cookie.add(compileMatchRegex(rule).get)
     if sc.rewrite_url.isSome:
       let fun = jsctx.eval(sc.rewrite_url.get, "<siteconf>",
         JS_EVAL_TYPE_GLOBAL)
@@ -201,7 +201,7 @@ proc getSiteConfig*(config: Config, jsctx: JSContext): seq[SiteConfig] =
 
 proc getOmniRules*(config: Config, jsctx: JSContext): seq[OmniRule] =
   for rule in config.omnirule:
-    let re = compileRegex(rule.match, 0)
+    let re = compileMatchRegex(rule.match)
     var conf = OmniRule(
       match: re.get
     )
diff --git a/src/js/regex.nim b/src/js/regex.nim
index d1ec34a5..eb6ef80f 100644
--- a/src/js/regex.nim
+++ b/src/js/regex.nim
@@ -68,6 +68,41 @@ proc compileRegex*(buf: string, flags: int): Result[Regex, string] =
   regex.bytecode = bytecode
   return ok(regex)
 
+func countBackslashes(buf: string, i: int): int =
+  ## Length of the run of consecutive backslashes in `buf` that ends at
+  ## index `i`, scanning towards the start; 0 when `i` is negative.
+  var pos = i
+  while pos >= 0 and buf[pos] == '\\':
+    dec pos
+  return i - pos
+
+# Anchoring rules:
+# ^abcd -> ^abcd
+# efgh$ -> efgh$
+# ^ijkl$ -> ^ijkl$
+# mnop -> ^(?:mnop)$
+proc compileMatchRegex*(buf: string): Result[Regex, string] =
+  ## Compiles `buf` as an exact-match regex, unless it starts with a caret
+  ## or ends with an unescaped dollar sign; in those cases it is compiled
+  ## unchanged. Returns an error if `buf` ends with a dangling (unpaired)
+  ## backslash.
+  if buf.len == 0 or buf[0] == '^':
+    return compileRegex(buf, 0)
+  if buf[^1] == '$':
+    # The dollar is escaped only if an odd number of backslashes precede it.
+    if buf.countBackslashes(buf.high - 1) mod 2 == 0:
+      return compileRegex(buf, 0)
+    # escaped. proceed as if no dollar sign was at the end
+  elif buf[^1] == '\\':
+    # An odd run of trailing backslashes leaves the last one dangling; it
+    # would otherwise escape the closing parenthesis appended below.
+    if buf.countBackslashes(buf.high) mod 2 == 1:
+      return err("unexpected end")
+  # The group is non-capturing, so group numbering in `buf` is unchanged;
+  # without it, `a|b` would compile to `^a|b$`, i.e. `(^a)|(b$)`.
+  let anchored = "^(?:" & buf & ")$"
+  return compileRegex(anchored, 0)
+
 proc compileSearchRegex*(str: string): Result[Regex, string] =
   # Parse any applicable flags in regex/<flags>. The last forward slash is
   # dropped when <flags> is empty, and interpreted as a character when the