Improved incremental search: support unicode

author: bptato <nincsnevem662@gmail.com> 2022-07-23 16:54:02 +0200
committer: bptato <nincsnevem662@gmail.com> 2022-07-23 17:39:06 +0200
commit: e7b53775a3f52cb5d8da865213c5dc38b954e33c (patch)
tree: 200741f8ed965f407c1a6af8823705db133bf3d4
parent: c7f25b2fe470849e028f9502d3da0851f149f065 (diff)
download: chawan-e7b53775a3f52cb5d8da865213c5dc38b954e33c.tar.gz
7 files changed, 189 insertions, 59 deletions
diff --git a/src/bindings/libregexp.nim b/src/bindings/libregexp.nim
index 1b84400e..e0a05543 100644
--- a/src/bindings/libregexp.nim
+++ b/src/bindings/libregexp.nim
@@ -17,3 +17,5 @@ proc lre_exec*(capture: ptr ptr uint8, bc_buf: ptr uint8, cbuf: ptr uint8,
                opaque: pointer): cint {.importc: "lre_exec", header: lreheader.}
 
 proc lre_get_capture_count*(bc_buf: ptr uint8): cint {.importc: "lre_get_capture_count", header: lreheader.}
+
+proc lre_get_flags*(bc_buf: ptr uint8): cint {.importc: "lre_get_flags", header: lreheader.} # binding for C lre_get_flags; bc_buf is compiled bytecode, as in lre_exec
diff --git a/src/client.nim b/src/client.nim
index d8f6e2e7..f16e6883 100644
--- a/src/client.nim
+++ b/src/client.nim
@@ -7,6 +7,7 @@ import unicode
 import css/sheet
 import config/config
 import io/buffer
+import io/cell
 import io/lineedit
 import io/loader
 import js/javascript
@@ -304,25 +305,32 @@ proc isearch(client: Client) =
   client.statusMode()
   var iput: string
   let cpos = client.buffer.cpos
+  var mark: Mark # highlight of the current match; nil until one is added
+  var my: int # index of the line the current mark was added to
+  template del_mark() =
+    if mark != nil:
+      client.buffer.removeMark(my, mark)
   let status = readLine("/", iput, client.buffer.width, {}, (proc(state: var LineState): bool =
-    client.buffer.marks.setLen(0)
+    del_mark # clear the highlight left by the previous invocation
     let regex = compileSearchRegex($state.news)
     client.buffer.cpos = cpos
     if regex.issome:
       let match = client.buffer.cursorNextMatch(regex.get)
       if match.success:
-        client.buffer.addMark(match.x, match.y, match.str)
-        let nos = client.buffer.nostatus
+        mark = client.buffer.addMark(match.x, match.y, match.str.width())
+        my = match.y # removeMark searches this line's marks, so record it
         client.buffer.redraw = true
         client.buffer.refreshBuffer(true)
         print(HVP(client.buffer.height + 1, 2))
       else:
-        client.buffer.marks.setLen(0)
+        del_mark
       return true
     false
   ))
-  client.buffer.marks.setLen(0)
+
+  del_mark
   client.buffer.redraw = true
+  client.buffer.refreshBuffer(true)
   if status:
     client.regex = compileSearchRegex(iput)
   else:
@@ -332,22 +340,29 @@ proc isearchBack(client: Client) =
   client.statusMode()
   var iput: string
   let cpos = client.buffer.cpos
+  var mark: Mark # highlight of the current match; nil until one is added
+  var my: int # index of the line the mark was added to (set together with mark)
+  template del_mark() =
+    if mark != nil:
+      client.buffer.removeMark(my, mark) # does nothing if line my no longer holds mark
   let status = readLine("?", iput, client.buffer.width, {}, (proc(state: var LineState): bool =
-    client.buffer.marks.setLen(0)
+    del_mark
     let regex = compileSearchRegex($state.news)
     client.buffer.cpos = cpos
     if regex.issome:
       let match = client.buffer.cursorPrevMatch(regex.get)
       if match.success:
-        client.buffer.addMark(match.x, match.y, match.str)
-        let nos = client.buffer.nostatus
+        mark = client.buffer.addMark(match.x, match.y, match.str.width())
+        my = match.y
         client.buffer.redraw = true
         client.buffer.refreshBuffer(true)
         print(HVP(client.buffer.height + 1, 2))
+      else:
+        del_mark
       return true
     false
   ))
-  client.buffer.marks.setLen(0)
+  del_mark
   client.buffer.redraw = true
   if status:
     client.regex = compileSearchRegex(iput)
diff --git a/src/io/buffer.nim b/src/io/buffer.nim
index 065a85a8..b3022615 100644
--- a/src/io/buffer.nim
+++ b/src/io/buffer.nim
@@ -38,12 +38,6 @@ type
     y*: int
     str*: string
 
-  Mark* = object
-    x: int
-    y: int
-    format: Format
-    grid: FixedGrid
-
   Buffer* = ref object
     contenttype*: string
     title*: string
@@ -73,7 +67,6 @@ type
     next*: Buffer
     userstyle*: CSSStylesheet
     loader*: FileLoader
-    marks*: seq[Mark]
 
 proc newBuffer*(): Buffer =
   new(result)
@@ -195,14 +188,6 @@ func generateStatusMessage*(buffer: Buffer): string =
   if w < buffer.width:
     result &= EL()
 
-func generateMark*(buffer: Buffer, mark: Mark): string =
-  var format = newFormat()
-  var w = 0
-  for cell in mark.grid:
-    result &= format.processFormat(cell.format)
-    result &= $cell.runes
-    w += cell.width()
-
 func numLines(buffer: Buffer): int = buffer.lines.len
 
 func lastVisibleLine(buffer: Buffer): int = min(buffer.fromy + buffer.height, buffer.numLines)
@@ -332,27 +317,36 @@ proc refreshDisplay(buffer: Buffer) =
 
   for line in buffer.lines[buffer.fromy..
                            buffer.lastVisibleLine - 1]:
-    var w = 0
-    var i = 0
+    var w = 0 # width of the row so far
+    var i = 0 # byte in line.str
+
+    # Skip cells till buffer.fromx.
     while w < buffer.fromx and i < line.str.len:
       fastRuneAt(line.str, i, r)
       w += r.width()
 
-    let dls = y * buffer.width
+    let dls = y * buffer.width # starting position of row in display
+
+    # Fill in the gap in case we skipped more cells than fromx mandates (i.e.
+    # we encountered a double-width character.)
     var k = 0
-    var cf = line.findFormat(w)
-    var nf = line.findNextFormat(w)
     if w > buffer.fromx:
       while k < w - buffer.fromx:
         buffer.display[dls + k].runes.add(Rune(' '))
         inc k
 
+    var cf = line.findFormat(w)
+    var nf = line.findNextFormat(w)
+
+    let startw = w # save this for later
+
+    # Now fill in the visible part of the row.
     while i < line.str.len:
       let pw = w
       fastRuneAt(line.str, i, r)
       w += r.width()
       if w > buffer.fromx + buffer.width:
-        break
+        break # die on exceeding the width limit
       if nf.pos != -1 and nf.pos <= pw:
         cf = nf
         nf = line.findNextFormat(pw)
@@ -364,6 +358,14 @@ proc refreshDisplay(buffer: Buffer) =
       while k < tk and k < buffer.width - 1:
         inc k
 
+    # Then, for each cell that has a mark, override its formatting with that
+    # specified by the mark. Mark positions are line columns, display cells are
+    # screen columns, so fromx has to be subtracted when indexing the display.
+    let aw = buffer.width - (startw - buffer.fromx) # actual width
+    for mark in line.marks:
+      for x in max(mark.x, startw)..<min(mark.x + mark.width, startw + aw):
+        buffer.display[dls + x - buffer.fromx].format = mark.format
+
     inc y
 
 proc setCursorX(buffer: Buffer, x: int, refresh = true, save = true) =
@@ -698,19 +700,17 @@ proc gotoAnchor*(buffer: Buffer) =
         return
       inc i
 
-proc addMark*(buffer: Buffer, x, y: int, str: string) =
-proc addMark*(buffer: Buffer, x, y: int, str: string) =
+proc addMark*(buffer: Buffer, x, y, width: int): Mark = # marks `width` cells starting at cell x of line y
   assert y < buffer.lines.len
   var format = newFormat()
   format.reverse = true
-  #TODO get rid of the string part; marks should only consist of a position and
-  # a length.
-  var grid = newFixedGrid(str.width())
-  var i = 0
-  for r in str.runes:
-    grid[i].runes.add(r)
-    grid[i].format = format
-    i += r.width()
-  buffer.marks.add(Mark(x: x, y: y, format: format, grid: grid))
+  result = Mark(x: x, width: width, format: format) # carries the reverse format built above
+  buffer.lines[y].marks.add(result) # stored on the line; the returned ref is what removeMark takes
+
+proc removeMark*(buffer: Buffer, y: int, mark: Mark) =
+  let idx = find(buffer.lines[y].marks, mark) # -1 when line y does not hold mark
+  if idx >= 0:
+    buffer.lines[y].marks.delete(idx)
 
 proc cursorNextMatch(buffer: Buffer, regex: Regex, sy, ey: int, wrap = false): BufferMatch =
   for y in sy..ey:
@@ -721,10 +721,11 @@ proc cursorNextMatch(buffer: Buffer, regex: Regex, sy, ey: int, wrap = false): B
     let res = regex.exec(buffer.lines[y].str, s)
     if res.success and res.captures.len > 0:
       let cap = res.captures[0]
-      buffer.setCursorXY(cap.s, y)
+      let x = buffer.lines[y].str.width(cap.s)
+      buffer.setCursorXY(x, y)
       result.success = true
       result.y = y
-      result.x = buffer.cursorBytes(y, cap.s)
+      result.x = x
       result.str = buffer.lines[y].str.substr(cap.s, cap.e - 1)
       return
 
@@ -752,10 +753,11 @@ proc cursorPrevMatch*(buffer: Buffer, regex: Regex, sy, ey: int, wrap = false):
       for i in countdown(res.captures.high, 0):
         let cap = res.captures[i]
         if cap.s < e:
-          buffer.setCursorXY(cap.s, y)
+          let x = buffer.lines[y].str.width(cap.s)
+          buffer.setCursorXY(x, y)
           result.success = true
           result.y = y
-          result.x = buffer.cursorBytes(y, cap.s)
+          result.x = x
           result.str = buffer.lines[y].str.substr(cap.s, cap.e - 1)
           return
 
@@ -1214,13 +1216,6 @@ proc refreshBuffer*(buffer: Buffer, peek = false) =
     buffer.refreshDisplay()
     buffer.displayBufferSwapOutput()
 
-  for mark in buffer.marks:
-    if mark.y in buffer.fromy..(buffer.fromy + buffer.height) and mark.x in buffer.fromx..(buffer.fromx + buffer.width):
-      print(HVP(mark.y - buffer.fromy + 1, mark.x - buffer.fromx + 1))
-      print(SGR())
-      print(buffer.generateMark(mark))
-      print(SGR())
-
   if not peek:
     if not buffer.nostatus:
       buffer.statusMsgForBuffer()
diff --git a/src/io/cell.nim b/src/io/cell.nim
index abcb8f92..4d808dda 100644
--- a/src/io/cell.nim
+++ b/src/io/cell.nim
@@ -35,6 +35,12 @@ type
   FlexibleLine* = object
     str*: string
     formats*: seq[FormatCell]
+    marks*: seq[Mark]
+
+  Mark* = ref object # format override for a span of a line, applied by refreshDisplay
+    x*: int # first cell of the span, in cell widths from the start of the line
+    width*: int # number of cells the span covers
+    format*: Format # format written over the covered display cells
 
   FlexibleGrid* = seq[FlexibleLine]
 
diff --git a/src/js/regex.nim b/src/js/regex.nim
index a1bd7d35..75ff7535 100644
--- a/src/js/regex.nim
+++ b/src/js/regex.nim
@@ -1,10 +1,12 @@
 # Interface for QuickJS libregexp.
 
 import options
+import unicode
 
 import bindings/libregexp
 import bindings/quickjs
 import js/javascript
+import strings/charset
 
 export
   LRE_FLAG_GLOBAL,
@@ -36,8 +38,10 @@ proc compileRegex*(buf: string, flags: int): Option[Regex] =
   var error_msg_size = 64
   var error_msg = cast[cstring](alloc0(error_msg_size))
   let bytecode = lre_compile(addr len, error_msg, cint(error_msg_size), cstring(buf), csize_t(buf.len), cint(flags), dummyContext)
+
   if error_msg != nil:
     #TODO error handling?
+    #eprint "err", error_msg
     dealloc(error_msg)
     error_msg = nil
   if bytecode == nil:
@@ -55,10 +59,9 @@ proc compileSearchRegex*(str: string): Option[Regex] =
   while i >= 0:
     case str[i]
     of '/':
-      if i > 0 and str[i - 1] == '\\': break # escaped
       flagsi = i
       break
-    of 'i', 'm', 's': discard
+    of 'i', 'm', 's', 'u': discard
     else: break # invalid flag
     dec i
 
@@ -73,20 +76,36 @@ proc compileSearchRegex*(str: string): Option[Regex] =
     of 'i': flags = flags or LRE_FLAG_IGNORECASE
     of 'm': flags = flags or LRE_FLAG_MULTILINE
     of 's': flags = flags or LRE_FLAG_DOTALL
+    of 'u': flags = flags or LRE_FLAG_UTF16
     else: assert false
   return compileRegex(str.substr(0, flagsi - 1), flags)
 
 proc exec*(regex: Regex, str: string, start = 0): RegexResult =
   assert 0 <= start and start <= str.len
-  let cstr = cstring(str)
-  let captureCount = lre_get_capture_count(cast[ptr uint8](regex.bytecode))
+
+  let captureCount = lre_get_capture_count(regex.bytecode)
+
   var capture: ptr ptr uint8 = nil
   if captureCount > 0:
     capture = cast[ptr ptr uint8](alloc0(sizeof(ptr uint8) * captureCount * 2))
+  var cstr = cstring(str)
+  var ascii = true # stays true only while every byte of str is below 0x80
+  for c in str:
+    if c >= char(0x80):
+      ascii = false
+      break
+  var ustr: string16
+  if not ascii:
+    ustr = toUTF16(str) # 16-bit input for lre_exec; cstr points into it below
+    cstr = cstring(ustr)
+  let clen = if ascii: str.len else: ustr.len div 2 # length in lre_exec's units
+  let cindex = if ascii: start else: toUTF16(str.substr(0, start - 1)).len div 2
   let ret = lre_exec(capture, regex.bytecode,
-                     cast[ptr uint8](cstr), cint(start),
-                     cint(str.len), cint(0), dummyContext)
+                     cast[ptr uint8](cstr), cint(cindex),
+                     cint(clen), cint(not ascii), dummyContext)
+
   result.success = ret == 1 #TODO error handling? (-1)
+
   if result.success:
     var i = 0
     let cstrAddress = cast[int](cstr)
@@ -99,7 +118,22 @@ proc exec*(regex: Regex, str: string, start = 0): RegexResult =
       let endPointer = cast[ptr ptr uint8](endPointerAddress)
       let startAddress = cast[int](startPointer[])
       let endAddress = cast[int](endPointer[])
-      let s = startAddress - cstrAddress
-      let e = endAddress - cstrAddress
-      result.captures.add((s, e))
+      var s = startAddress - cstrAddress
+      var e = endAddress - cstrAddress
+      if ascii:
+        result.captures.add((s, e))
+      else:
+        var s8 = 0
+        var e8 = 0
+        var i = 0
+        var r: Rune
+        while i < s:
+          fastRuneAt(ustr, i, r)
+          let si = r.size()
+          s8 += si
+          e8 += si
+        while i < e:
+          fastRuneAt(ustr, i, r)
+          e8 += r.size()
+        result.captures.add((s8, e8))
   dealloc(capture)
diff --git a/src/strings/charset.nim b/src/strings/charset.nim
new file mode 100644
index 00000000..8726268a
--- /dev/null
+++ b/src/strings/charset.nim
@@ -0,0 +1,69 @@
+import unicode
+
+type string16* = distinct string
+
+# Encode a UTF-8 string as UTF-16, stored as raw bytes with the low byte of
+# each 16-bit code unit first.
+# Note: this doesn't check for (invalid) UTF-8 containing surrogates.
+proc toUTF16*(s: string): string16 =
+  var buf = ""
+  template put16(unit: uint16) =
+    # Append one code unit: low byte, then high byte.
+    buf.add(cast[char](unit))
+    buf.add(cast[char](unit shr 8))
+  for r in s.runes:
+    let cp = uint32(r)
+    if cp < 0x10000: # ucs-2
+      put16 uint16(cp)
+    elif cp <= 0x10FFFF: # surrogate
+      # Code points above the BMP become a high/low surrogate pair.
+      let off = cp - 0x10000
+      put16 uint16((off shr 10) + 0xD800)
+      put16 uint16((off and 0x3FF) + 0xDC00)
+    else: # invalid
+      # Out-of-range values are replaced by U+FFFD.
+      put16 uint16(0xFFFD)
+  string16(buf)
+
+proc len*(s: string16): int {.borrow.} # length in bytes, not in UTF-16 code units
+proc `[]`*(s: string16, i: int): char = string(s)[i] # byte at index i
+proc `[]`*(s: string16, i: BackwardsIndex): char = string(s)[i] # byte indexed from the end
+
+template fastRuneAt*(s: string16, i: int, r: untyped, doInc = true, be = false) =
+  ## Decodes the UTF-16 code point starting at byte offset `i` of `s` into `r`.
+  ## With `doInc`, `i` is advanced past the bytes consumed; `be` selects
+  ## big-endian code units (little-endian otherwise).
+  ## A lone trailing byte yields U+FFFD; an unmatched surrogate is returned
+  ## as-is.
+  if i + 1 == s.len: # unmatched byte
+    when doInc: inc i
+    r = Rune(0xFFFD)
+  else:
+    when be:
+      var c1: uint32 = (uint32(s[i]) shl 8) + uint32(s[i + 1])
+    else:
+      var c1: uint32 = uint32(s[i]) + (uint32(s[i + 1]) shl 8)
+    var c2: uint32 = 0 # unit after c1; left at 0 unless c1 can start a pair
+    # Only a high surrogate with a whole code unit after it can form a pair.
+    if c1 >= 0xD800 and c1 < 0xDC00 and i + 3 < s.len:
+      when be:
+        c2 = (uint32(s[i + 2]) shl 8) + uint32(s[i + 3])
+      else:
+        c2 = uint32(s[i + 2]) + (uint32(s[i + 3]) shl 8)
+    if c2 >= 0xDC00 and c2 < 0xE000: # c1/c2 is a valid surrogate pair
+      r = Rune((((c1 and 0x3FF) shl 10) or (c2 and 0x3FF)) + 0x10000)
+      when doInc: i += 4
+    else:
+      r = Rune(c1) # ucs-2, or an unmatched surrogate passed through as-is
+      when doInc: i += 2
+
+iterator runes*(s: string16): Rune =
+  var pos = 0 # byte offset of the next undecoded code unit
+  var cur: Rune
+  while pos < s.len:
+    fastRuneAt(s, pos, cur) # decodes one rune and moves pos past it
+    yield cur
+
+proc fromUTF16*(s: string16): string =
+  for rune in s.runes: # append each decoded rune to the result string
+    result.add(rune)
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index 4937d0be..c88c5980 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -867,6 +867,15 @@ func width*(s: string): int =
   for r in s.runes():
     result += width(r)
 
+func width*(s: string, len: int): int =
+  ## Sum of `width(r)` over the runes that start within the first `len` bytes.
+  let limit = min(len, s.len) # never read past the end of s
+  var pos = 0
+  var r: Rune
+  while pos < limit:
+    fastRuneAt(s, pos, r)
+    result += width(r)
+
 func width*(s: seq[Rune]): int =
   for r in s:
     result += width(r)
author	bptato <nincsnevem662@gmail.com>	2022-07-23 16:54:02 +0200
committer	bptato <nincsnevem662@gmail.com>	2022-07-23 17:39:06 +0200
commit	e7b53775a3f52cb5d8da865213c5dc38b954e33c (patch)
tree	200741f8ed965f407c1a6af8823705db133bf3d4
parent	c7f25b2fe470849e028f9502d3da0851f149f065 (diff)
download	chawan-e7b53775a3f52cb5d8da865213c5dc38b954e33c.tar.gz