about summary refs log blame commit diff stats
path: root/src/utils/twtstr.nim
blob: b1c6d1c973af852a4e3b54d2005bf383a0c68392 (plain) (tree)
1
2
3
4
5
6
7
8
9
                    



                   



                    
 
                



                         
 
                                       
                                           
 
                                    
                                         
 
                                     


                              

                                       


                             
 


                                         
                         
            

                          
 





                                             















                                           
 





                                                   
                                                                      

         
                              






                                  

                              


                            
 

                                        
                                          


                                 
                                         
                       















                                     
 


                                                         













                                                    
















                                              
                                             
             
                                                            
              
 

                                                          
                            
                 

                    
 

                                                               




                                   
                                                     
                      
 
                                              





                            
                                                      
 
                                                         







                                
                                                                        
 
                                                          







                                
                                                                          
 

                                                     





                                                                              
                                      









                                                                             
                                                     
                                                

                   
           






                              


                      

                                                                            

 
                                



                                  

                                   
                                                 
 

                                                      
 




                                      
                            





                     































                         


















                       
                                                                                             

                                                                             
                 





                                 
                
                  




                              

                                                
                    
                
         
                             
 


                                                           

                                            
 

                                            
 







                                                                         
           


                                 






                              
                                                
                               

                    

                      
 



                                                                          








                                                                      





                                                                         
                                              
                     
                                        
                 




                          

                               
                


                                 
                                         


                                      

                               
                                           



                                  






                                                
                                           


                                  
                                                                                
 















                                                                          
     

                                                                      
 
                                                                
                                     
                              
                 
                   


                 
                     
 
                                                                  
                                     
             
                                             
 
                                                                         
                
                                           
 
                                                                             
                                           
 
                                                    
            
                      
                    
                                      

                 







                                        
 
                                     
             
             






                               
 






                                   
                                 
                                                      










                            
                                                                         

                         























                                                                  
                








                                       
                                                    








                                    


                                                         

             
                                             





                  
                   

                           




                                
                                
            
                   
                                   



                     
                                        











                                               
               
 
                                                        

                 

                    




                                    
              


                           
              


           


                                          
                             




                                    





















                                                                                                               















                                                   
 















                                                   
                                                                 










                                                   
            









                                                        
          






























































                                                                              
import std/algorithm
import std/math
import std/options
import std/os
import std/strutils
import std/unicode

when defined(posix):
  import std/posix

import types/opt
import utils/charcategory
import utils/map

export charcategory

func onlyWhitespace*(s: string): bool =
  return AllChars - AsciiWhitespace notin s

func isControlChar*(r: Rune): bool =
  return int(r) <= 0x1F or int(r) == 0x7F

func getControlChar*(c: char): char =
  if c == '?':
    return char(127)
  return char(int(c) and 0x1F)

func getControlLetter*(c: char): char =
  if c == char(127):
    return '?'
  return char(int(c) or 0x40)

func toHeaderCase*(str: string): string =
  result = str
  var flip = true
  for c in result.mitems:
    if flip:
      c = c.toUpperAscii()
    flip = c == '-'

func snakeToKebabCase*(str: string): string =
  result = str
  for c in result.mitems:
    if c == '_':
      c = '-'

func kebabToCamelCase*(s: string): string =
  result = s
  var flip = false
  for c in result.mitems:
    if flip:
      c = c.toUpperAscii()
    flip = c == '-'

func camelToKebabCase*(s: string): string =
  result = ""
  for c in s:
    if c in AsciiUpperAlpha:
      result &= '-'
      result &= c.toLowerAscii()
    else:
      result &= c

func startsWithNoCase*(str, prefix: string): bool =
  if str.len < prefix.len: return false
  # prefix.len is always lower
  var i = 0
  while true:
    if i == prefix.len: return true
    if str[i].toLowerAscii() != prefix[i].toLowerAscii(): return false
    inc i

func hexValue*(c: char): int =
  if c in AsciiDigit:
    return int(c) - int('0')
  if c in 'a'..'f':
    return int(c) - int('a') + 0xA
  if c in 'A'..'F':
    return int(c) - int('A') + 0xA
  return -1

func decValue*(c: char): int =
  if c in AsciiDigit:
    return int(c) - int('0')
  return -1

const HexCharsUpper = "0123456789ABCDEF"
const HexCharsLower = "0123456789abcdef"
func pushHex*(buf: var string; u: uint8) =
  buf &= HexCharsUpper[u shr 4]
  buf &= HexCharsUpper[u and 0xF]

func pushHex*(buf: var string; c: char) =
  buf.pushHex(uint8(c))

func toHexLower*(u: uint16): string =
  var x = u
  let len = if (u and 0xF000) != 0:
    4
  elif (u and 0x0F00) != 0:
    3
  elif (u and 0xF0) != 0:
    2
  else:
    1
  var s = newString(len)
  for i in countdown(len - 1, 0):
    s[i] = HexCharsLower[x and 0xF]
    x = x shr 4
  return s

func equalsIgnoreCase*(s1, s2: string): bool {.inline.} =
  return s1.cmpIgnoreCase(s2) == 0

func startsWithIgnoreCase*(s1, s2: string): bool =
  if s1.len < s2.len: return false
  for i in 0 ..< s2.len:
    if s1[i].toLowerAscii() != s2[i].toLowerAscii():
      return false
  return true

func endsWithIgnoreCase*(s1, s2: string): bool =
  if s1.len < s2.len: return false
  for i in countdown(s2.high, 0):
    if s1[i].toLowerAscii() != s2[i].toLowerAscii():
      return false
  return true

func stripAndCollapse*(s: string): string =
  var i = 0
  while i < s.len and s[i] in AsciiWhitespace:
    inc i
  var space = false
  while i < s.len:
    if s[i] notin AsciiWhitespace:
      if space:
        result &= ' '
        space = false
      result &= s[i]
    elif not space:
      space = true
    else:
      result &= ' '
    inc i

func skipBlanks*(buf: string; at: int): int =
  result = at
  while result < buf.len and buf[result] in AsciiWhitespace:
    inc result

func until*(s: string; c: set[char]; starti = 0): string =
  result = ""
  for i in starti ..< s.len:
    if s[i] in c:
      break
    result.add(s[i])

func untilLower*(s: string; c: set[char]; starti = 0): string =
  result = ""
  for i in starti ..< s.len:
    if s[i] in c:
      break
    result.add(s[i].toLowerAscii())

func until*(s: string; c: char; starti = 0): string =
  s.until({c}, starti)

func after*(s: string; c: set[char]): string =
  var i = 0
  while i < s.len:
    if s[i] in c:
      return s.substr(i + 1)
    inc i

func after*(s: string; c: char): string = s.after({c})

func afterLast*(s: string; c: set[char]; n = 1): string =
  var j = 0
  for i in countdown(s.high, 0):
    if s[i] in c:
      inc j
      if j == n:
        return s.substr(i + 1)
  return s

func afterLast*(s: string; c: char; n = 1): string = s.afterLast({c}, n)

func beforeLast*(s: string; c: set[char]; n = 1): string =
  var j = 0
  for i in countdown(s.high, 0):
    if s[i] in c:
      inc j
      if j == n:
        return s.substr(0, i)
  return s

func beforeLast*(s: string; c: char; n = 1): string = s.beforeLast({c}, n)

proc c_sprintf(buf, fm: cstring): cint
  {.header: "<stdio.h>", importc: "sprintf", varargs}

# From w3m
const SizeUnit = [
  cstring"b", cstring"kb", cstring"Mb", cstring"Gb", cstring"Tb", cstring"Pb",
  cstring"Eb", cstring"Zb", cstring"Bb", cstring"Yb"
]
func convertSize*(size: int): string =
  var sizepos = 0
  var csize = float32(size)
  while csize >= 999.495 and sizepos < SizeUnit.len:
    csize = csize / 1024.0
    inc sizepos
  result = newString(10)
  let f = floor(csize * 100 + 0.5) / 100
  discard c_sprintf(cstring(result), cstring("%.3g%s"), f, SizeUnit[sizepos])
  result.setLen(cstring(result).len)

func numberAdditive*(i: int; range: HSlice[int, int];
    symbols: openArray[(int, string)]): string =
  if i notin range:
    return $i
  var n = i
  var at = 0
  while n > 0:
    if n >= symbols[at][0]:
      n -= symbols[at][0]
      result &= symbols[at][1]
      continue
    inc at
  return result

const romanNumbers = [
  (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"), (100, "C"), (90, "XC"),
  (50, "L"), (40, "XL"), (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I")
]

const romanNumbersLower = block:
  var res: seq[(int, string)]
  for (n, s) in romanNumbers:
    res.add((n, s.toLowerAscii()))
  res

func romanNumber*(i: int): string =
  return numberAdditive(i, 1..3999, romanNumbers)

func romanNumberLower*(i: int): string =
  return numberAdditive(i, 1..3999, romanNumbersLower)

func japaneseNumber*(i: int): string =
  if i == 0:
    return "〇"
  var n = i
  if i < 0:
    result &= "マイナス"
    n *= -1

  let o = n

  var ss: seq[string]
  var d = 0
  while n > 0:
    let m = n mod 10

    if m != 0:
      case d
      of 1: ss.add("十")
      of 2: ss.add("百")
      of 3: ss.add("千")
      of 4:
        ss.add("万")
        ss.add("一")
      of 5:
        ss.add("万")
        ss.add("十")
      of 6:
        ss.add("万")
        ss.add("百")
      of 7:
        ss.add("万")
        ss.add("千")
        ss.add("一")
      of 8:
        ss.add("億")
        ss.add("一")
      of 9:
        ss.add("億")
        ss.add("十")
      else: discard
    case m
    of 0:
      inc d
      n = n div 10
    of 1:
      if o == n:
        ss.add("一")
    of 2: ss.add("二")
    of 3: ss.add("三")
    of 4: ss.add("四")
    of 5: ss.add("五")
    of 6: ss.add("六")
    of 7: ss.add("七")
    of 8: ss.add("八")
    of 9: ss.add("九")
    else: discard
    n -= m

  n = ss.len - 1
  while n >= 0:
    result &= ss[n]
    dec n

# Implements https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#signed-integers
func parseIntImpl[T: SomeSignedInt](s: string; allowed: set[char]; radix: T):
    Option[T] =
  var sign: T = 1
  var i = 0
  if i < s.len and s[i] == '-':
    sign = -1
    inc i
  elif i < s.len and s[i] == '+':
    inc i
  if i == s.len:
    return none(T)
  var integer: T = 0
  while i < s.len:
    if s[i] notin allowed:
      return none(T) # invalid
    let c = T(hexValue(s[i]))
    if unlikely(T.high div radix - c < integer):
      return none(T) # overflow
    integer *= radix
    integer += c
    inc i
  return some(sign * integer)

func parseIntImpl[T: SomeSignedInt](s: string): Option[T] =
  return parseIntImpl[T](s, AsciiDigit, 10)

func parseInt32*(s: string): Option[int32] =
  return parseIntImpl[int32](s)

func parseInt64*(s: string): Option[int64] =
  return parseIntImpl[int64](s)

func parseOctInt64*(s: string): Option[int64] =
  return parseIntImpl[int64](s, AsciiOctDigit, 8)

func parseHexInt64*(s: string): Option[int64] =
  return parseIntImpl[int64](s, AsciiHexDigit, 16)

func parseUIntImpl[T: SomeUnsignedInt](s: string; allowSign: static bool;
    allowed: set[char]; radix: T): Option[T] =
  var i = 0
  when allowSign:
    if i < s.len and s[i] == '+':
      inc i
  if i == s.len:
    return none(T)
  var integer: T = 0
  while i < s.len:
    if s[i] notin allowed:
      return none(T) # invalid
    let c = T(hexValue(s[i]))
    if unlikely(T.high div radix - c < integer):
      return none(T) # overflow
    integer *= radix
    integer += c
    inc i
  return some(integer)

func parseUIntImpl[T: SomeUnsignedInt](s: string; allowSign: static bool):
    Option[T] =
  return parseUIntImpl[T](s, allowSign, AsciiDigit, 10)

func parseUInt8*(s: string; allowSign: static bool): Option[uint8] =
  return parseUIntImpl[uint8](s, allowSign)

func parseUInt16*(s: string; allowSign: static bool): Option[uint16] =
  return parseUIntImpl[uint16](s, allowSign)

func parseUInt32*(s: string; allowSign: static bool): Option[uint32] =
  return parseUIntImpl[uint32](s, allowSign)

func parseOctUInt32*(s: string; allowSign: static bool): Option[uint32] =
  return parseUIntImpl[uint32](s, allowSign, AsciiOctDigit, 8)

func parseHexUInt32*(s: string; allowSign: static bool): Option[uint32] =
  return parseUIntImpl[uint32](s, allowSign, AsciiHexDigit, 16)

#TODO not sure where this algorithm is from...
# (probably from CSS)
func parseFloat64*(s: string): float64 =
  var sign = 1f64
  var t = 1
  var d = 0
  var integer: float64 = 0
  var f: float64 = 0
  var e: float64 = 0
  var i = 0
  if i < s.len and s[i] == '-':
    sign = -1f64
    inc i
  elif i < s.len and s[i] == '+':
    inc i
  while i < s.len and s[i] in AsciiDigit:
    integer *= 10
    integer += float64(decValue(s[i]))
    inc i
  if i < s.len and s[i] == '.':
    inc i
    while i < s.len and s[i] in AsciiDigit:
      f *= 10
      f += float64(decValue(s[i]))
      inc i
      inc d
  if i < s.len and (s[i] == 'e' or s[i] == 'E'):
    inc i
    if i < s.len and s[i] == '-':
      t = -1
      inc i
    elif i < s.len and s[i] == '+':
      inc i
    while i < s.len and s[i] in AsciiDigit:
      e *= 10
      e += float64(decValue(s[i]))
      inc i
  return sign * (integer + f * pow(10, float64(-d))) * pow(10, (float64(t) * e))

const ControlPercentEncodeSet* = Controls + NonAscii
const FragmentPercentEncodeSet* = ControlPercentEncodeSet +
  {' ', '"', '<', '>', '`'}
const QueryPercentEncodeSet* = FragmentPercentEncodeSet - {'`'} + {'#'}
const SpecialQueryPercentEncodeSet* = QueryPercentEncodeSet + {'\''}
const PathPercentEncodeSet* = QueryPercentEncodeSet + {'?', '`', '{', '}'}
const UserInfoPercentEncodeSet* = PathPercentEncodeSet +
  {'/', ':', ';', '=', '@', '['..'^', '|'}
const ComponentPercentEncodeSet* = UserInfoPercentEncodeSet +
  {'$'..'&', '+', ','}
const ApplicationXWWWFormUrlEncodedSet* = ComponentPercentEncodeSet +
  {'!', '\''..')', '~'}
# used by pager
when DirSep == '\\':
  const LocalPathPercentEncodeSet* = Ascii - AsciiAlpha - AsciiDigit -
    {'.', '\\', '/'}
else:
  const LocalPathPercentEncodeSet* = Ascii - AsciiAlpha - AsciiDigit -
    {'.', '/'}

proc percentEncode*(append: var string; c: char; set: set[char];
    spaceAsPlus = false) {.inline.} =
  if spaceAsPlus and c == ' ':
    append &= '+'
  elif c notin set:
    append &= c
  else:
    append &= '%'
    append.pushHex(c)

proc percentEncode*(append: var string; s: string; set: set[char];
    spaceAsPlus = false) {.inline.} =
  for c in s:
    append.percentEncode(c, set, spaceAsPlus)

func percentEncode*(c: char; set: set[char]; spaceAsPlus = false): string
    {.inline.} =
  result.percentEncode(c, set, spaceAsPlus)

func percentEncode*(s: string; set: set[char]; spaceAsPlus = false): string =
  result.percentEncode(s, set, spaceAsPlus)

func percentDecode*(input: string; si = 0): string =
  var i = si
  while i < input.len:
    let c = input[i]
    if c != '%' or i + 2 >= input.len:
      result &= c
    else:
      let h1 = input[i + 1].hexValue
      let h2 = input[i + 2].hexValue
      if h1 == -1 or h2 == -1:
        result &= c
      else:
        result &= char((h1 shl 4) or h2)
        i += 2
    inc i

func htmlEscape*(s: string): string =
  result = ""
  for c in s:
    case c
    of '<': result &= "&lt;"
    of '>': result &= "&gt;"
    of '&': result &= "&amp;"
    of '"': result &= "&quot;"
    of '\'': result &= "&apos;"
    else: result &= c

func dqEscape*(s: string): string =
  result = newStringOfCap(s.len)
  for c in s:
    if c == '"':
      result &= '\\'
    result &= c

#basically std join but with char
func join*(ss: openArray[string]; sep: char): string =
  if ss.len == 0:
    return ""
  var n = ss.high - 1
  for i in 0..high(ss):
    n += ss[i].len
  result = newStringOfCap(n)
  result &= ss[0]
  for i in 1..high(ss):
    result &= sep
    result &= ss[i]

proc passRealloc*(opaque, p: pointer; size: csize_t): pointer {.cdecl.} =
  return realloc(p, size)

# https://www.w3.org/TR/xml/#NT-Name
const NameStartCharRanges = [
  (0xC0, 0xD6),
  (0xD8, 0xF6),
  (0xF8, 0x2FF),
  (0x370, 0x37D),
  (0x37F, 0x1FFF),
  (0x200C, 0x200D),
  (0x2070, 0x218F),
  (0x2C00, 0x2FEF),
  (0x3001, 0xD7FF),
  (0xF900, 0xFDCF),
  (0xFDF0, 0xFFFD),
  (0x10000, 0xEFFFF)
]
const NameCharRanges = [ # + NameStartCharRanges
  (0xB7, 0xB7),
  (0x0300, 0x036F),
  (0x203F, 0x2040)
]
const NameStartCharAscii = {':', '_'} + AsciiAlpha
const NameCharAscii = NameStartCharAscii + {'-', '.'} + AsciiDigit
func matchNameProduction*(str: string): bool =
  if str.len == 0:
    return false
  # NameStartChar
  var i = 0
  var r: Rune
  if str[i] in Ascii:
    if str[i] notin NameStartCharAscii:
      return false
    inc i
  else:
    fastRuneAt(str, i, r)
    if not isInRange(NameStartCharRanges, int32(r)):
      return false
  # NameChar
  while i < str.len:
    if str[i] in Ascii:
      if str[i] notin NameCharAscii:
        return false
      inc i
    else:
      fastRuneAt(str, i, r)
      if not isInRange(NameStartCharRanges, int32(r)) and
          not isInMap(NameCharRanges, int32(r)):
        return false
  return true

func matchQNameProduction*(s: string): bool =
  if s.len == 0:
    return false
  if s[0] == ':':
    return false
  if s[^1] == ':':
    return false
  var colon = false
  for i in 1 ..< s.len - 1:
    if s[i] == ':':
      if colon:
        return false
      colon = true
  return s.matchNameProduction()

func utf16Len*(s: string): int =
  result = 0
  for r in s.runes:
    if uint32(r) < 0x10000: # ucs-2
      result += 1
    else: # surrogate
      result += 2

proc expandPath*(path: string): string =
  if path.len == 0 or path[0] != '~':
    return path
  if path.len == 1:
    return getHomeDir()
  elif path[1] == '/':
    return getHomeDir() / path.substr(2)
  else:
    when defined(posix):
      let usr = path.until({'/'}, 1)
      let p = getpwnam(cstring(usr))
      if p != nil:
        return $p.pw_dir / path.substr(usr.len)
    return path

func deleteChars*(s: string; todel: set[char]): string =
  var i = 0
  block earlyret:
    for j, c in s:
      if c in todel:
        i = j
        break earlyret
    return s
  var rs = newStringOfCap(s.len - 1)
  for j in 0 ..< i:
    rs &= s[j]
  for j in i + 1 ..< s.len:
    if s[j] in todel:
      continue
    rs &= s[j]
    inc i
  return rs

func replaceControls*(s: string): string =
  result = newStringOfCap(s.len)
  for c in s:
    if c in Controls - {' '}:
      result &= '^'
      result &= c.getControlLetter()
    else:
      result &= c

#https://html.spec.whatwg.org/multipage/form-control-infrastructure.html#multipart/form-data-encoding-algorithm
proc makeCRLF*(s: string): string =
  result = newStringOfCap(s.len)
  var i = 0
  while i < s.len - 1:
    if s[i] == '\r' and s[i + 1] != '\n':
      result &= '\r'
      result &= '\n'
    elif s[i] != '\r' and s[i + 1] == '\n':
      result &= s[i]
      result &= '\r'
      result &= '\n'
      inc i
    else:
      result &= s[i]
    inc i
  if i < s.len:
    if s[i] == '\r':
      result &= '\r'
      result &= '\n'
    else:
      result &= s[i]

func strictParseEnum*[T: enum](s: string): Opt[T] =
  # cmp when len is small enough, otherwise hashmap
  when {T.low..T.high}.len <= 4:
    for e in T.low .. T.high:
      if $e == s:
        return ok(e)
  else:
    const tab = (func(): Table[string, T] =
      result = initTable[string, T]()
      for e in T.low .. T.high:
        result[$e] = e
    )()
    if s in tab:
      return ok(tab[s])
  return err()

func parseEnumNoCase*[T: enum](s: string): Opt[T] =
  # cmp when len is small enough, otherwise hashmap
  when {T.low..T.high}.len <= 4:
    for e in T.low .. T.high:
      if $e.equalsIgnoreCase(s):
        return ok(e)
  else:
    const tab = (func(): Table[string, T] =
      result = initTable[string, T]()
      for e in T.low .. T.high:
        result[$e] = e
    )()
    if s in tab:
      return ok(tab[s])
  return err()

proc getContentTypeAttr*(contentType, attrname: string): string =
  var i = contentType.find(';')
  if i == -1:
    return ""
  i = contentType.find(attrname, i)
  if i == -1:
    return ""
  i = contentType.skipBlanks(i + attrname.len)
  if i >= contentType.len or contentType[i] != '=':
    return ""
  i = contentType.skipBlanks(i + 1)
  var q = false
  var s = ""
  for c in contentType.toOpenArray(i, contentType.high):
    if q:
      s &= c
      q = false
    elif c == '\\':
      q = true
    elif c in AsciiWhitespace + {';'}:
      break
    else:
      s &= c
  return s

func atob(c: char): uint8 {.inline.} =
  # see RFC 4648 table
  if c in AsciiUpperAlpha:
    return uint8(c) - uint8('A')
  if c in AsciiLowerAlpha:
    return uint8(c) - uint8('a') + 26
  if c in AsciiDigit:
    return uint8(c) - uint8('0') + 52
  if c == '+':
    return 62
  if c == '/':
    return 63
  return uint8.high

func atob0*(data: string): Result[string, string] =
  var outs = newStringOfCap(data.len div 4 * 3)
  var buf: array[4, uint8]
  var i = 0
  var j = 0
  var pad = 0
  while true:
    i = data.skipBlanks(i)
    if i >= data.len:
      break
    if data[i] == '=':
      i = data.skipBlanks(i + 1)
      inc pad
      break
    buf[j] = atob(data[i])
    if buf[j] == uint8.high:
      return err("Invalid character in encoded string")
    if j == 3:
      let ob1 = (buf[0] shl 2) or (buf[1] shr 4) # 6 bits of b0 | 2 bits of b1
      let ob2 = (buf[1] shl 4) or (buf[2] shr 2) # 4 bits of b1 | 4 bits of b2
      let ob3 = (buf[2] shl 6) or buf[3]         # 2 bits of b2 | 6 bits of b3
      outs &= char(ob1)
      outs &= char(ob2)
      outs &= char(ob3)
      j = 0
    else:
      inc j
    inc i
  if i < data.len:
    if i < data.len and data[i] == '=':
      inc pad
      inc i
    i = data.skipBlanks(i)
  if pad > 0 and j + pad != 4:
    return err("Too much padding")
  if i < data.len:
    return err("Invalid character after encoded string")
  if j == 3:
    let ob1 = (buf[0] shl 2) or (buf[1] shr 4) # 6 bits of b0 | 2 bits of b1
    let ob2 = (buf[1] shl 4) or (buf[2] shr 2) # 4 bits of b1 | 4 bits of b2
    outs &= char(ob1)
    outs &= char(ob2)
  elif j == 2:
    let ob1 = (buf[0] shl 2) or (buf[1] shr 4) # 6 bits of b0 | 2 bits of b1
    outs &= char(ob1)
  elif j != 0:
    return err("Incorrect number of characters in encoded string")
  return ok(outs)