From 4d7361009863a0ea2bd75919a79574cdc213c72b Mon Sep 17 00:00:00 2001
From: Wojtek Kosior <kwojtus@protonmail.com>
Date: Tue, 23 Jul 2019 13:47:34 +0200
Subject: initial commit

---
 hashtable.c | 502 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 502 insertions(+)
 create mode 100644 hashtable.c

(limited to 'hashtable.c')

diff --git a/hashtable.c b/hashtable.c
new file mode 100644
index 0000000..c2dc582
--- /dev/null
+++ b/hashtable.c
@@ -0,0 +1,502 @@
+// GENERAL INFO
+
+// You might want to read the beginning of hashtable.h first.
+
+// In some places "rehashing" and in other ones "resizing" seemed to
+// be the right word to use. They mean more or less the same.
+
+// Functions starting with ht_ are part of the API. Internal functions
+// are declared static. I also made some of them inline (either
+// because they were extremely short or only called from 1 place).
+
+// Hashtable size is always a power of 2.
+
+// When the hashtable is ¾ full, a new, 2x bigger table is allocated
+// and whenever one of 4 basic operations (adding, removing, setting,
+// getting) occurs, 4 slots are being rehashed from old table into 8
+// slots in new table. Similarly, when hashtable is ¼ full, a new,
+// 2x smaller table is allocated and each of subsequent operations
+// rehashes 8 entries from old table into 4 in new table.
+// This mechanism has been made lazier: getting and removing don't
+// trigger growing of ht even if it's 3/4 full. Similarly, getting,
+// setting and adding don't trigger shrinking.
+// Once resizing is triggered, however, any of the operations will
+// contribute to rehashing. Even if, for example, the operation is
+// ADD and the table is being shrinked.
+// This means, that if we have a hashtable of size n which is ¾ full
+// and growing is triggered, then each subsequent call to
+// ht_{add,rem,get,set}() rehashes some entries and, depending on
+// how frequently and how successfully each of these 4 funcs was
+// called, at the end of resizing we get a size 2n hashtable which is
+// between ¼ and ½ full. Similarly, if shrinking of a ¼ full
+// hashtable of size n is triggered, then after some operations we
+// get a size ½n hashtable, that is somewhere between ¼ and ¾ full.
+// One can see now, that we always keep the hashtable between ¼ and ¾
+// full (with the exception of a minimal size one, that can be empty).
+
+#include "hashtable.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+// We won't shrink hashtable below this size. Newly created one will
+// be this big.
+#define MIN_SIZE 32
+
+// Special value of ht->rehashing_position.
+#define NOT_REHASHING ((ssize_t) -1)
+
+// Those are possible return values of do_resizing_related_stuff()
+// and rehash_some_entries() (which only returns the first 2).
+#define REHASHING_IN_PROGRESS     0
+#define REHASHING_NOT_IN_PROGRESS 1
+#define REHASHING_NO_MEM          2
+
+enum op
+  {
+    GET,
+    ADD,
+    SET,
+    REM,
+  };
+
+int ht_init(hashtable_t *ht,
+	    size_t (*hash)(void *key),
+	    int (*cmp)(void *key1, void *key2))
+{
+  if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**))))
+    return HT_NO_MEM;
+
+  ht->tab_size = MIN_SIZE;
+  ht->rehashing_position = NOT_REHASHING;
+  ht->entries = 0;
+  ht->hashfunc = hash;
+  ht->cmpfunc = cmp;
+  
+  return HT_OK;
+}
+
+// First come some utilities :)
+
+static inline size_t min(size_t n1, size_t n2)
+{
+  return n1 < n2 ? n1 : n2;
+}
+
+static inline size_t hash2(size_t n)
+{
+  // I found this "hash improver" on the internet.
+  n ^= (n >> 20) ^ (n >> 12);
+  return n ^ (n >> 7) ^ (n >> 4);
+}
+
+// Below are 2 list-handling utility functions.
+static inline struct ht_node *join_lists(struct ht_node *l1,
+					 struct ht_node *l2)
+{
+  if (!l1) return l2;
+  if (!l2) return l1;
+
+  struct ht_node *l1_last;
+
+  for (l1_last = l1; l1_last->next; l1_last = l1_last->next);
+
+  // Append l2 to the end of l1.
+  l1_last->next = l2;
+
+  // For convenience return the first element of the resulting list.
+  return l1;
+}
+
+
+static inline void push(struct ht_node *node, struct ht_node **list)
+{
+  node->next = *list;
+  *list = node;
+}
+
+// The following 2 rehash_* functions are helpers of
+// rehash_some_entries().
+// This func rehashes 1 chain of entries in tab[]
+// into 2 chains in newtab[].
+static inline void rehash_position_growing(hashtable_t *ht)
+{
+  // There are 2 possible new positions of an entry in a 2x bigger ht.
+  struct ht_node *list0 = NULL, *list1 = NULL;
+  
+  size_t old_position = ht->rehashing_position,
+    new_position0 = old_position,
+    new_position1 = old_position | ht->tab_size;
+  
+  struct ht_node *pair = ht->tab[old_position], *next_pair;
+
+  while (pair)
+    {
+      next_pair = pair->next;
+
+      size_t new_position = hash2(ht->hashfunc(pair->key))
+	& (ht->new_size - 1);
+
+      push(pair, new_position == new_position1 ? &list1 : &list0);
+
+      pair = next_pair;
+    }
+
+  ht->newtab[new_position0] = list0;
+  ht->newtab[new_position1] = list1;
+
+  ht->rehashing_position++;
+}
+
+// This func rehashes 2 chains of entries in tab[]
+// into 1 chain in newtab[].
+static inline void rehash_2positions_shrinking(hashtable_t *ht)
+{
+  size_t new_position = ht->rehashing_position,
+    old_position0 = new_position,
+    old_position1 = new_position | ht->new_size;
+  
+  ht->newtab[new_position] = join_lists(ht->tab[old_position0],
+					ht->tab[old_position1]);
+
+  ht->rehashing_position++;
+}
+
+// Rehashes 4(8) positions from tab to newtab. If those were the last
+// enetries to rehash, the function takes care of everything
+// (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS.
+// Otherwise, returns REHASHING_IN_PROGRESS.
+// Caller must make sure rehashing was started b4 calling this func.
+static int rehash_some_entries(hashtable_t *ht)
+{
+  int rehashes_left = 4;
+  
+  if (ht->new_size > ht->tab_size)                    // growing ht
+    {
+      while(rehashes_left--) rehash_position_growing(ht);
+      if (ht->rehashing_position != ht->tab_size)
+	return REHASHING_IN_PROGRESS;
+    }
+  else                                                // shrinking ht
+    {
+      while(rehashes_left--) rehash_2positions_shrinking(ht);
+      if (ht->rehashing_position != ht->new_size)
+	return REHASHING_IN_PROGRESS;
+    }
+
+  // rehashing finishes
+  ht->rehashing_position = NOT_REHASHING;
+  ht->tab_size = ht->new_size;
+  free(ht->tab);
+  ht->tab = ht->newtab;
+  
+  return REHASHING_NOT_IN_PROGRESS;
+}
+
+static inline bool resizing_taking_place(hashtable_t *ht)
+{
+  return !(ht->rehashing_position == NOT_REHASHING);
+}
+
+void ht_finish_resizing(hashtable_t *ht)
+{
+  if (resizing_taking_place(ht))
+    while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS);
+}
+
+static inline bool needs_growing(hashtable_t *ht)
+{
+  return ht->entries == 3 * ht->tab_size / 4;
+}
+
+static inline bool needs_shrinking(hashtable_t *ht)
+{
+  return ht->tab_size > MIN_SIZE
+    && ht->entries == ht->tab_size / 4;
+}
+
+// Each of hashtable operations (add, set, rem, get) should also
+// attempt to do part of resizing. This way resizing operation
+// which is O(n) is distributed among many hashtable accesses
+// each of them still being O(1). Without this the the amortized
+// complexity of ht accesses would still be O(1), but a single access
+// would sometimes be O(n).
+// Other function that adds, sets, gets or removes sth from ht uses
+// this one to do this "part of resizing" mentioned above.
+// This func returns REHASHING_NO_MEM on failed malloc (won't happen
+// for GET operation) and REHASHING_(NOT_)IN_PROGRESS otherwise.
+static inline int do_resizing_related_stuff(hashtable_t *ht,
+					      void *key, enum op op)
+{
+  bool resizing = resizing_taking_place(ht);
+  
+  if (!resizing)
+    {
+      size_t new_size;
+
+      switch (op)
+	{
+	case GET:
+	  goto dont_start_resizing;
+	case ADD:
+	case SET:
+	  if (needs_growing(ht))
+	    new_size = ht->tab_size * 2;
+	  else
+	    goto dont_start_resizing;
+	  break;
+	default: // case REM
+	  if (needs_shrinking(ht))
+	    new_size = ht->tab_size / 2;
+	  else
+	    goto dont_start_resizing;
+	}
+  
+      struct ht_node **newtab;
+      if (!(newtab = malloc(new_size * sizeof(struct ht_node*))))
+	return REHASHING_NO_MEM;
+  
+      ht->newtab = newtab;
+      ht->new_size = new_size;
+      ht->rehashing_position = 0;
+
+      resizing = true;
+    }
+  
+ dont_start_resizing:
+
+  return resizing ?
+    rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS;
+}
+
+// This is a chaining hashtable, so each element in the array (table)
+// is actually a list of entries. All operations (adding, removing,
+// etc.) need to find the right list of entries (here called "bucket")
+// for a given key first, so it makes sense to do it in a separate
+// function. The bucket may be in tab or newtab if resizing is taking
+// place. Being informed by the caller if resizing is in progress,
+// this func does not need to check for it itself.
+static inline struct ht_node **find_bucket(hashtable_t *ht, void *key,
+					   bool resizing_in_progress)
+{
+  size_t hash = hash2(ht->hashfunc(key)),
+    destination_tab_size, position;
+  
+  struct ht_node **destination_tab;
+  
+  if (resizing_in_progress)
+    // Here we must check whether our key's bucket is still
+    // in ht->tab or already rehashed to ht->newtab.
+    {
+      size_t smaller_tab_size = min(ht->tab_size, ht->new_size),
+	smaller_tab_position = hash & (smaller_tab_size - 1);
+
+      if (smaller_tab_position < ht->rehashing_position)
+	{
+	  destination_tab = ht->newtab;
+	  destination_tab_size = ht->new_size;
+	}
+      else
+	{
+	  destination_tab = ht->tab;
+	  destination_tab_size = ht->tab_size;
+	}
+    }
+  else
+    // In this case we know, we're working on ht->tab and not newtab.
+    {
+      destination_tab = ht->tab;
+      destination_tab_size = ht->tab_size;
+    }
+
+  position = hash & (destination_tab_size - 1);
+  return &destination_tab[position];
+}
+
+// Operations of adding, removing, etc. all work on list of entries
+// (bucket) to wchich key hashes and they have some common logic, so
+// it made sense to make a single function, that does the right
+// operation based on an enum passed to it.
+static inline int
+perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket,
+			    void *key, void *val,
+			    void **keyptr, void **valptr,
+			    enum op op)
+{
+  for (struct ht_node **pairptr = bucket, *pair = *pairptr;
+       pair;
+       pairptr = &pair->next, pair = pair->next)
+    
+    if (!ht->cmpfunc(key, pair->key))
+      {
+	if (op == ADD)
+	  return HT_KEY_PRESENT;
+	
+	if (keyptr) *keyptr = pair->key;
+	if (valptr) *valptr = pair->val;
+	
+	switch (op)
+	  {
+	  case GET:
+	    {
+	      return HT_OK;
+	    }
+	  case SET:
+	    {
+	      pair->key = key;
+	      pair->val = val;
+	      return HT_OK;
+	    }
+	  default: // case REM
+	    {
+	      *pairptr = pair->next;
+	      free(pair);
+	      ht->entries--;
+	      return HT_OK;
+	    }
+	  }
+      }
+
+  if (op == GET || op == REM)
+    return HT_KEY_ABSENT;
+
+  // op == ADD || op == SET
+
+  struct ht_node *new_pair = malloc(sizeof(struct ht_node));
+  if (!new_pair)
+    return HT_NO_MEM;
+
+  *new_pair = (struct ht_node) {.key = key, .val = val};
+  push(new_pair, bucket);
+  ht->entries++;
+
+  return HT_OK;
+}
+
+// Generic function for performing of adding, removing, setting and
+// getting.
+static int perform_operation(hashtable_t *ht, void *key, void *val,
+			     void **keyptr, void **valptr, enum op op)
+{
+  bool resizing_in_progress;
+
+  switch (do_resizing_related_stuff(ht, key, op))
+    {
+    case REHASHING_IN_PROGRESS:
+      resizing_in_progress = true;
+      break;
+    case REHASHING_NOT_IN_PROGRESS:
+      resizing_in_progress = false;
+      break;
+    default: // case REHASHING_NO_MEM
+      return HT_NO_MEM;
+    }
+
+  struct ht_node **bucket =
+    find_bucket(ht, key, resizing_in_progress);
+
+  return perform_operation_on_bucket(ht, bucket, key, val,
+				     keyptr, valptr, op);
+}
+
+// The 4 functions below are the main part of the API.
+int ht_get(hashtable_t *ht, void *key,
+	   void **storedkey, void **val)
+{
+  return perform_operation(ht, key, NULL, storedkey, val, GET);
+}
+
+int ht_add(hashtable_t *ht, void *key, void *val)
+{
+  return perform_operation(ht, key, val, NULL, NULL, ADD);
+}
+
+int ht_set(hashtable_t *ht, void *key, void *val,
+	   void **oldkey, void **oldval)
+{
+  return perform_operation(ht, key, val, oldkey, oldval, SET);
+}
+
+int ht_rem(hashtable_t *ht, void *key,
+	   void **storedkey, void **val)
+{
+  return perform_operation(ht, key, NULL, storedkey, val, REM);
+}
+
+// As mentioned in hashtable.h, this func does not deallocate keys
+// nor vals. One could use ht_map() to free() them before calling this
+// func.
+void ht_destroy(hashtable_t *ht)
+{
+  if (!ht->entries) return;
+  ht_finish_resizing(ht);
+
+  struct ht_node **tab = ht->tab;
+  
+  for (ssize_t position = ht->tab_size - 1;
+       position >= 0; position--)
+    {
+      struct ht_node *pair = tab[position], *nextpair;
+      
+      while (pair)
+	{
+	  nextpair = pair->next;
+	  free(pair);
+	  pair = nextpair;
+	}
+    }
+  
+  free(ht->tab);
+}
+
+void ht_map(hashtable_t *ht, void *arg,
+	    void (*mapfunc)(void *key, void *val, void *arg))
+{
+  if (!ht->entries) return;
+  ht_finish_resizing(ht);
+
+  struct ht_node **tab = ht->tab;
+  
+  for (ssize_t position = ht->tab_size - 1;
+       position >= 0; position--)
+    
+    for (struct ht_node *pair = tab[position];
+	 pair; pair = pair->next)
+
+      mapfunc(pair->key, pair->val, arg);
+}
+
+void ht_map_destroy(hashtable_t *ht, void *arg,
+		    void (*mapfunc)(void *key, void *val, void *arg))
+{
+  // If mapfunc() deallocates keys, then the next 2 lines require
+  // assumption on ht_destroy(), that it doesn't call ht->hashfunc()
+  // or ht->cmpfunc() on keys.
+  ht_map(ht, arg, mapfunc);
+  ht_destroy(ht);
+}
+
+// These 2 functions are for easy making of hashtable with strings as
+// keys.
+size_t ht_string_hash(const char *key)
+{
+  size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4;
+
+  do
+    {
+      char shift = ((unsigned char) key[i]) % sizeof(size_t);
+      hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift)))
+	^ key[i];
+    }
+  while (key[i++]);
+
+  return hash;
+}
+
+int ht_string_init(hashtable_t *ht)
+{
+  return ht_init(ht, (size_t (*)(void*)) &ht_string_hash,
+		 (int (*)(void*, void*)) &strcmp);
+}
-- 
cgit v1.2.3