summary refs log blame commit diff stats
path: root/lib/pure/collections/sets.nim
blob: 4af2b7ad4921e501ce691beacc2750cac3e12c24 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11


                                     
                                         






                                                                             
                                                                            
                                         



                  
                     

                             




                                                   
                                                              


                             













                                                                               









                                            
                                                      












                                                               
                                 






                                                                        
                                    





                                         
                                         

              




                                                                            
                                                      



                                                                 

                                             
                                                      
                            

                     
                                                                          

                 
                                 


                                       
                                                                 

                 
                               
                            
               

                                                     

                  
                                         
                            


                 

                                                     



                                       
                                                      

            

                                               

                                                                  

                                  

                                       
                                                      
                            



                                  

                                               

                                                                  

                                  


                                                                           
                                                      








                                                                    
                                             



                                                           
                                       







                                       
                                                      

              

                                                            
                               

                                                            



                                                 
                                                                            

                                                            



                                                    

                                                                               

                                                            



                              
 

                                                                
                               

                                                            



                                                       
                                                   


                        
                                                   


                               



                                                   
                                                     



                                          
                                                           

                                                            



                               




















                                                                         
                                                                           









                                                                          














                                                                              

                                                 
                    


                                                  
                    
 
                                                                   







                                                             
                                                      


                       
                                                



                                                    
                                                      
                            

                     
                                         






                                                                 
                                        







                                       
                                    




                                              
                                                      

            

                                                      

                                                                  

                                  


                                                                           
                                                      










                                                                            
                                                           





                                                                   
                                                      
              
 




















































                                                         



                            







                                                  




                                              



































                                                                 



                                                        
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## The ``sets`` module implements an efficient hash set and ordered hash set.
##
## **Note**: The data types declared here have *value semantics*: This means
## that ``=`` performs a copy of the set.

import
  os, hashes, math

{.pragma: myShallow.}
when not defined(nimhygiene):
  {.pragma: dirty.}

type
  TSlotEnum = enum seEmpty, seFilled, seDeleted
  TKeyValuePair[A] = tuple[slot: TSlotEnum, key: A]
  TKeyValuePairSeq[A] = seq[TKeyValuePair[A]]
  TSet* {.final, myShallow.}[A] = object ## a generic hash set
    data: TKeyValuePairSeq[A]
    counter: int

proc isValid*[A](s: TSet[A]): bool =
  ## Returns `true` if the set has been initialized with `initSet <#initSet>`_.
  ##
  ## Most operations over an uninitialized set will crash at runtime and
  ## `assert <system.html#assert>`_ in debug builds. You can use this proc in
  ## your own methods to verify that sets passed to your procs are correctly
  ## initialized. Example:
  ##
  ## .. code-block :: nimrod
  ##   proc savePreferences(options: TSet[string]) =
  ##     assert options.isValid, "Pass an initialized set!"
  ##     # Do stuff here, may crash in release builds!
  result = not s.data.isNil

proc len*[A](s: TSet[A]): int =
  ## returns the number of keys in `s`.
  result = s.counter

proc card*[A](s: TSet[A]): int =
  ## alias for `len`.
  result = s.counter

iterator items*[A](s: TSet[A]): A =
  ## iterates over any key in the table `t`.
  assert s.isValid, "The set needs to be initialized."
  for h in 0..high(s.data):
    if s.data[h].slot == seFilled: yield s.data[h].key

const
  growthFactor = 2

proc mustRehash(length, counter: int): bool {.inline.} =
  assert(length > counter)
  result = (length * 2 < counter * 3) or (length - counter < 4)

proc nextTry(h, maxHash: THash): THash {.inline.} =
  result = ((5 * h) + 1) and maxHash

template rawGetImpl() {.dirty.} =
  var h: THash = hash(key) and high(s.data) # start with real hash value
  while s.data[h].slot != seEmpty:
    if s.data[h].key == key and s.data[h].slot == seFilled:
      return h
    h = nextTry(h, high(s.data))
  result = -1

template rawInsertImpl() {.dirty.} =
  var h: THash = hash(key) and high(data)
  while data[h].slot == seFilled:
    h = nextTry(h, high(data))
  data[h].key = key
  data[h].slot = seFilled

proc rawGet[A](s: TSet[A], key: A): int =
  rawGetImpl()

proc mget*[A](s: var TSet[A], key: A): var A =
  ## returns the element that is actually stored in 's' which has the same
  ## value as 'key' or raises the ``EInvalidKey`` exception. This is useful
  ## when one overloaded 'hash' and '==' but still needs reference semantics
  ## for sharing.
  assert s.isValid, "The set needs to be initialized."
  var index = rawGet(s, key)
  if index >= 0: result = t.data[index].key
  else: raise newException(EInvalidKey, "key not found: " & $key)

proc contains*[A](s: TSet[A], key: A): bool =
  ## returns true iff `key` is in `s`.
  assert s.isValid, "The set needs to be initialized."
  var index = rawGet(s, key)
  result = index >= 0

proc rawInsert[A](s: var TSet[A], data: var TKeyValuePairSeq[A], key: A) =
  rawInsertImpl()

proc enlarge[A](s: var TSet[A]) =
  var n: TKeyValuePairSeq[A]
  newSeq(n, len(s.data) * growthFactor)
  for i in countup(0, high(s.data)):
    if s.data[i].slot == seFilled: rawInsert(s, n, s.data[i].key)
  swap(s.data, n)

template inclImpl() {.dirty.} =
  var index = rawGet(s, key)
  if index < 0:
    if mustRehash(len(s.data), s.counter): enlarge(s)
    rawInsert(s, s.data, key)
    inc(s.counter)

template containsOrInclImpl() {.dirty.} =
  var index = rawGet(s, key)
  if index >= 0:
    result = true
  else:
    if mustRehash(len(s.data), s.counter): enlarge(s)
    rawInsert(s, s.data, key)
    inc(s.counter)

proc incl*[A](s: var TSet[A], key: A) =
  ## includes an element `key` in `s`.
  assert s.isValid, "The set needs to be initialized."
  inclImpl()

proc incl*[A](s: var TSet[A], other: TSet[A]) =
  ## includes everything in `other` in `s`
  assert s.isValid, "The set `s` needs to be initialized."
  assert other.isValid, "The set `other` needs to be initialized."
  for item in other: incl(s, item)

proc excl*[A](s: var TSet[A], key: A) =
  ## excludes `key` from the set `s`.
  assert s.isValid, "The set needs to be initialized."
  var index = rawGet(s, key)
  if index >= 0:
    s.data[index].slot = seDeleted
    dec(s.counter)

proc excl*[A](s: var TSet[A], other: TSet[A]) =
  ## excludes everything in `other` from `s`.
  assert s.isValid, "The set `s` needs to be initialized."
  assert other.isValid, "The set `other` needs to be initialized."
  for item in other: excl(s, item)

proc containsOrIncl*[A](s: var TSet[A], key: A): bool =
  ## returns true if `s` contains `key`, otherwise `key` is included in `s`
  ## and false is returned.
  assert s.isValid, "The set needs to be initialized."
  containsOrInclImpl()

proc initSet*[A](initialSize=64): TSet[A] =
  ## creates a new hash set that is empty. `initialSize` needs to be
  ## a power of two.
  assert isPowerOfTwo(initialSize)
  result.counter = 0
  newSeq(result.data, initialSize)

proc toSet*[A](keys: openArray[A]): TSet[A] =
  ## creates a new hash set that contains the given `keys`.
  result = initSet[A](nextPowerOfTwo(keys.len+10))
  for key in items(keys): result.incl(key)

template dollarImpl(): stmt {.dirty.} =
  result = "{"
  for key in items(s):
    if result.len > 1: result.add(", ")
    result.add($key)
  result.add("}")

proc `$`*[A](s: TSet[A]): string =
  ## The `$` operator for hash sets.
  assert s.isValid, "The set needs to be initialized."
  dollarImpl()

proc union*[A](s1, s2: TSet[A]): TSet[A] =
  ## returns a new set of all items that are contained in at
  ## least one of `s1` and `s2`
  assert s1.isValid, "The set `s1` needs to be initialized."
  assert s2.isValid, "The set `s2` needs to be initialized."
  result = s1
  incl(result, s2)

proc intersection*[A](s1, s2: TSet[A]): TSet[A] =
  ## returns a new set of all items that are contained in both `s1` and `s2`
  assert s1.isValid, "The set `s1` needs to be initialized."
  assert s2.isValid, "The set `s2` needs to be initialized."
  result = initSet[A](min(s1.data.len, s2.data.len))
  for item in s1:
    if item in s2: incl(result, item)

proc difference*[A](s1, s2: TSet[A]): TSet[A] =
  ## returns a new set of all items that are contained in `s1`, but not in `s2`
  assert s1.isValid, "The set `s1` needs to be initialized."
  assert s2.isValid, "The set `s2` needs to be initialized."
  result = initSet[A]()
  for item in s1:
    if not contains(s2, item):
      incl(result, item)

proc symmetricDifference*[A](s1, s2: TSet[A]): TSet[A] =
  ## returns a new set of all items that are contained in either
  ## `s1` or `s2`, but not both
  assert s1.isValid, "The set `s1` needs to be initialized."
  assert s2.isValid, "The set `s2` needs to be initialized."
  result = s1
  for item in s2:
    if containsOrIncl(result, item): excl(result, item)

proc `+`*[A](s1, s2: TSet[A]): TSet[A] {.inline.} =
  ## alias for `union`
  result = union(s1, s2)

proc `*`*[A](s1, s2: TSet[A]): TSet[A] {.inline.} =
  ## alias for `intersection`
  result = intersection(s1, s2)

proc `-`*[A](s1, s2: TSet[A]): TSet[A] {.inline.} =
  ## alias for `difference`
  result = difference(s1, s2)

proc `-+-`*[A](s1, s2: TSet[A]): TSet[A] {.inline.} =
  ## alias for `symmetricDifference`
  result = symmetricDifference(s1, s2)

proc disjoint*[A](s1, s2: TSet[A]): bool =
  ## returns true iff `s1` and `s2` have no items in common
  assert s1.isValid, "The set `s1` needs to be initialized."
  assert s2.isValid, "The set `s2` needs to be initialized."
  for item in s1:
    if item in s2: return false
  return true

proc `<`*[A](s, t: TSet[A]): bool =
  ## Is s a strict subset of t?
  s.counter != t.counter and s <= t

proc `<=`*[A](s, t: TSet[A]): bool =
  ## Is s a subset of t?
  result = false
  if s.counter > t.counter: return
  result = true
  for item in s:
    if not(t.contains(item)):
      result = false
      return

proc `==`*[A](s, t: TSet[A]): bool =
  s.counter == t.counter and s <= t

proc map*[A, B](data: TSet[A], op: proc (x: A): B {.closure.}): TSet[B] =
  result = initSet[B]()
  for item in data: result.incl(op(item))

# ------------------------------ ordered set ------------------------------

type
  TOrderedKeyValuePair[A] = tuple[
    slot: TSlotEnum, next: int, key: A]
  TOrderedKeyValuePairSeq[A] = seq[TOrderedKeyValuePair[A]]
  TOrderedSet* {.
      final, myShallow.}[A] = object ## set that remembers insertion order
    data: TOrderedKeyValuePairSeq[A]
    counter, first, last: int

proc isValid*[A](s: TOrderedSet[A]): bool =
  ## Returns `true` if the ordered set has been initialized with `initSet
  ## <#initOrderedSet>`_.
  ##
  ## Most operations over an uninitialized ordered set will crash at runtime
  ## and `assert <system.html#assert>`_ in debug builds. You can use this proc
  ## in your own methods to verify that ordered sets passed to your procs are
  ## correctly initialized. Example:
  ##
  ## .. code-block :: nimrod
  ##   proc saveTarotCards(cards: TOrderedSet[int]) =
  ##     assert cards.isValid, "Pass an initialized set!"
  ##     # Do stuff here, may crash in release builds!
  result = not s.data.isNil

proc len*[A](s: TOrderedSet[A]): int {.inline.} =
  ## returns the number of keys in `s`.
  result = s.counter

proc card*[A](s: TOrderedSet[A]): int {.inline.} =
  ## alias for `len`.
  result = s.counter

template forAllOrderedPairs(yieldStmt: stmt) {.dirty, immediate.} =
  var h = s.first
  while h >= 0:
    var nxt = s.data[h].next
    if s.data[h].slot == seFilled: yieldStmt
    h = nxt

iterator items*[A](s: TOrderedSet[A]): A =
  ## iterates over any key in the set `s` in insertion order.
  assert s.isValid, "The set needs to be initialized."
  forAllOrderedPairs:
    yield s.data[h].key

proc rawGet[A](s: TOrderedSet[A], key: A): int =
  rawGetImpl()

proc contains*[A](s: TOrderedSet[A], key: A): bool =
  ## returns true iff `key` is in `s`.
  assert s.isValid, "The set needs to be initialized."
  var index = rawGet(s, key)
  result = index >= 0

proc rawInsert[A](s: var TOrderedSet[A], 
                  data: var TOrderedKeyValuePairSeq[A], key: A) =
  rawInsertImpl()
  data[h].next = -1
  if s.first < 0: s.first = h
  if s.last >= 0: data[s.last].next = h
  s.last = h

proc enlarge[A](s: var TOrderedSet[A]) =
  var n: TOrderedKeyValuePairSeq[A]
  newSeq(n, len(s.data) * growthFactor)
  var h = s.first
  s.first = -1
  s.last = -1
  while h >= 0:
    var nxt = s.data[h].next
    if s.data[h].slot == seFilled: 
      rawInsert(s, n, s.data[h].key)
    h = nxt
  swap(s.data, n)

proc incl*[A](s: var TOrderedSet[A], key: A) =
  ## includes an element `key` in `s`.
  assert s.isValid, "The set needs to be initialized."
  inclImpl()

proc incl*[A](s: var TSet[A], other: TOrderedSet[A]) =
  ## includes everything in `other` in `s`
  assert s.isValid, "The set `s` needs to be initialized."
  assert other.isValid, "The set `other` needs to be initialized."
  for item in other: incl(s, item)

proc containsOrIncl*[A](s: var TOrderedSet[A], key: A): bool =
  ## returns true if `s` contains `key`, otherwise `key` is included in `s`
  ## and false is returned.
  assert s.isValid, "The set needs to be initialized."
  containsOrInclImpl()

proc initOrderedSet*[A](initialSize=64): TOrderedSet[A] =
  ## creates a new ordered hash set that is empty. `initialSize` needs to be
  ## a power of two.
  assert isPowerOfTwo(initialSize)
  result.counter = 0
  result.first = -1
  result.last = -1
  newSeq(result.data, initialSize)

proc toOrderedSet*[A](keys: openArray[A]): TOrderedSet[A] =
  ## creates a new ordered hash set that contains the given `keys`.
  result = initOrderedSet[A](nextPowerOfTwo(keys.len+10))
  for key in items(keys): result.incl(key)

proc `$`*[A](s: TOrderedSet[A]): string =
  ## The `$` operator for ordered hash sets.
  assert s.isValid, "The set needs to be initialized."
  dollarImpl()

proc testModule() =
  ## Internal micro test to validate docstrings and such.
  block isValidTest:
    var options: TSet[string]
    proc savePreferences(options: TSet[string]) =
      assert options.isValid, "Pass an initialized set!"
    options = initSet[string]()
    options.savePreferences

  block lenTest:
    var values: TSet[int]
    assert(not values.isValid)
    assert values.len == 0
    assert values.card == 0

  block setIterator:
    type pair = tuple[a, b: int]
    var a, b = initSet[pair]()
    a.incl((2, 3))
    a.incl((3, 2))
    a.incl((2, 3))
    for x, y in a.items:
      b.incl((x - 2, y + 1))
    assert a.len == b.card
    assert a.len == 2
    echo b

  block setContains:
    var values = initSet[int]()
    assert(not values.contains(2))
    values.incl(2)
    assert values.contains(2)
    values.excl(2)
    assert(not values.contains(2))

  block toSeqAndString:
    var a = toSet[int]([2, 4, 5])
    var b = initSet[int]()
    for x in [2, 4, 5]: b.incl(x)
    assert($a == $b)

  block setOperations:
    var
      a = toset[string](["a", "b"])
      b = toset[string](["b", "c"])
      c = union(a, b)
    assert c == toSet[string](["a", "b", "c"])
    var d = intersection(a, b)
    assert d == toSet[string](["b"])
    var e = difference(a, b)
    assert e == toSet[string](["a"])
    var f = symmetricDifference(a, b)
    assert f == toSet[string](["a", "c"])
    assert d < a and d < b
    assert((a < a) == false)
    assert d <= a and d <= b
    assert((a <= a))
    # Alias test.
    assert a + b == toSet[string](["a", "b", "c"])
    assert a * b == toSet[string](["b"])
    assert a - b == toSet[string](["a"])
    assert a -+- b == toSet[string](["a", "c"])
    assert disjoint(a, b) == false
    assert disjoint(a, b - a) == true

  block mapSet:
    var a = toSet[int]([1, 2, 3])
    var b = a.map(proc (x: int): string = $x)
    assert b == toSet[string](["1", "2", "3"])

  block isValidTest:
    var cards: TOrderedSet[string]
    proc saveTarotCards(cards: TOrderedSet[string]) =
      assert cards.isValid, "Pass an initialized set!"
    cards = initOrderedSet[string]()
    cards.saveTarotCards

  block lenTest:
    var values: TOrderedSet[int]
    assert(not values.isValid)
    assert values.len == 0
    assert values.card == 0

  block setIterator:
    type pair = tuple[a, b: int]
    var a, b = initOrderedSet[pair]()
    a.incl((2, 3))
    a.incl((3, 2))
    a.incl((2, 3))
    for x, y in a.items:
      b.incl((x - 2, y + 1))
    assert a.len == b.card
    assert a.len == 2

  block setContains:
    var values = initOrderedSet[int]()
    assert(not values.contains(2))
    values.incl(2)
    assert values.contains(2)

  block toSeqAndString:
    var a = toOrderedSet[int]([2, 4, 5])
    var b = initOrderedSet[int]()
    for x in [2, 4, 5]: b.incl(x)
    assert($a == $b)
    # assert(a == b) # https://github.com/Araq/Nimrod/issues/1413

  echo "Micro tests run successfully."

when isMainModule and not defined(release): testModule()