1 files changed, 565 insertions, 0 deletions
diff --git a/hashtable.c b/hashtable.c
new file mode 100644
index 0000000..a397879
--- /dev/null
+++ b/hashtable.c
@@ -0,0 +1,565 @@
+/**
+ * C hashtable implementation
+ *
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ */
+
+/*
+ * GENERAL INFO
+ *
+ * You might want to read the beginning of hashtable.h first.
+ *
+ * In some places "rehashing" and in other ones "resizing" seemed to
+ * be the right word to use. They mean more or less the same.
+ *
+ * Functions starting with ht_ are part of the API. Internal functions
+ * are declared static. I also made some of them inline (either
+ * because they were extremely short or only called from 1 place).
+ *
+ * Hashtable size is always a power of 2.
+ *
+ * When the hashtable is ¾ full, a new, 2x bigger table is allocated
+ * and whenever one of 4 basic operations (adding, removing, setting,
+ * getting) occurs, 4 slots are being rehashed from old table into 8
+ * slots in new table. Similarly, when hashtable is ¼ full, a new,
+ * 2x smaller table is allocated and each of subsequent operations
+ * rehashes 8 entries from old table into 4 in new table.
+ * This mechanism has been made lazier: getting and removing don't
+ * trigger growing of ht even if it's 3/4 full. Similarly, getting,
+ * setting and adding don't trigger shrinking.
+ * Once resizing is triggered, however, any of the operations will
+ * contribute to rehashing. Even if, for example, the operation is
+ * ADD and the table is being shrinked.
+ * This means, that if we have a hashtable of size n which is ¾ full
+ * and growing is triggered, then each subsequent call to
+ * ht_{add,rem,get,set}() rehashes some entries and, depending on
+ * how frequently and how successfully each of these 4 funcs was
+ * called, at the end of resizing we get a size 2n hashtable which is
+ * between ¼ and ½ full. Similarly, if shrinking of a ¼ full
+ * hashtable of size n is triggered, then after some operations we
+ * get a size ½n hashtable, that is somewhere between ¼ and ¾ full.
+ * One can see now, that we always keep the hashtable between ¼ and ¾
+ * full (with the exception of a minimal size one, that can be empty).
+ */
+
+#include "hashtable.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#ifdef _USE_INLINE
+#define INLINE inline
+#else
+#define INLINE
+#endif
+
+/*
+ * We won't shrink hashtable below this size. Newly created one will
+ * be this big.
+ */
+#define MIN_SIZE 4
+
+/* Special value of ht->rehashing_position. */
+#define NOT_REHASHING ((ssize_t) -1)
+
+/*
+ * Those are possible return values of do_resizing_related_stuff()
+ * and rehash_some_entries() (which only returns the first 2).
+ */
+#define REHASHING_IN_PROGRESS     0
+#define REHASHING_NOT_IN_PROGRESS 1
+#define REHASHING_NO_MEM          2
+
+/* Struct used to store a pair. */
+struct ht_node
+{
+  const void *key;
+  const void *val;
+  struct ht_node *next;
+};
+
+enum op
+  {
+    GET,
+    GET_THREADSAFE,
+    ADD,
+    SET,
+    REM,
+  };
+
+int ht_init(hashtable_t *ht,
+	    size_t (*hash)(const void *key),
+	    int (*cmp)(const void *key1, const void *key2))
+{
+  if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**))))
+    return HT_NO_MEM;
+
+  ht->tab_size = MIN_SIZE;
+  ht->rehashing_position = NOT_REHASHING;
+  ht->entries = 0;
+  ht->hashfunc = hash;
+  ht->cmpfunc = cmp;
+
+  return HT_OK;
+}
+
+/* First come some utilities :) */
+
+static INLINE size_t min(size_t n1, size_t n2)
+{
+  return n1 < n2 ? n1 : n2;
+}
+
+static INLINE size_t hash2(size_t n)
+{
+  /* I found this "hash improver" on the internet. */
+  n ^= (n >> 20) ^ (n >> 12);
+  return n ^ (n >> 7) ^ (n >> 4);
+}
+
+/* Below are 2 list-handling utility functions. */
+static INLINE struct ht_node *join_lists(struct ht_node *l1,
+					 struct ht_node *l2)
+{
+  if (!l1) return l2;
+  if (!l2) return l1;
+
+  struct ht_node *l1_last;
+
+  for (l1_last = l1; l1_last->next; l1_last = l1_last->next);
+
+  /* Append l2 to the end of l1. */
+  l1_last->next = l2;
+
+  /* For convenience return the first element of the resulting list. */
+  return l1;
+}
+
+
+static INLINE void push(struct ht_node *node, struct ht_node **list)
+{
+  node->next = *list;
+  *list = node;
+}
+
+/*
+ * The following 2 rehash_* functions are helpers of rehash_some_entries().
+ * This func rehashes 1 chain of entries in tab[] into 2 chains in newtab[].
+ */
+static INLINE void rehash_position_growing(hashtable_t *ht)
+{
+  /* There are 2 possible new positions of an entry in a 2x bigger ht. */
+  struct ht_node *list0 = NULL, *list1 = NULL;
+
+  size_t old_position = ht->rehashing_position,
+    new_position0 = old_position,
+    new_position1 = old_position | ht->tab_size;
+
+  struct ht_node *pair = ht->tab[old_position], *next_pair;
+
+  while (pair)
+    {
+      next_pair = pair->next;
+
+      size_t new_position = hash2(ht->hashfunc(pair->key))
+	& (ht->new_size - 1);
+
+      push(pair, new_position == new_position1 ? &list1 : &list0);
+
+      pair = next_pair;
+    }
+
+  ht->newtab[new_position0] = list0;
+  ht->newtab[new_position1] = list1;
+
+  ht->rehashing_position++;
+}
+
+/* This func rehashes 2 chains of entries in tab[] into 1 chain in newtab[]. */
+static INLINE void rehash_2positions_shrinking(hashtable_t *ht)
+{
+  size_t new_position = ht->rehashing_position,
+    old_position0 = new_position,
+    old_position1 = new_position | ht->new_size;
+
+  ht->newtab[new_position] = join_lists(ht->tab[old_position0],
+					ht->tab[old_position1]);
+
+  ht->rehashing_position++;
+}
+
+/*
+ * Rehashes 4(8) positions from tab to newtab. If those were the last
+ * enetries to rehash, the function takes care of everything
+ * (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS.
+ * Otherwise, returns REHASHING_IN_PROGRESS.
+ * Caller must make sure rehashing was started b4 calling this func.
+ */
+static int rehash_some_entries(hashtable_t *ht)
+{
+  int rehashes_left = 4;
+
+  if (ht->new_size > ht->tab_size)                    /* growing ht */
+    {
+      while(rehashes_left--) rehash_position_growing(ht);
+      if (ht->rehashing_position != ht->tab_size)
+	return REHASHING_IN_PROGRESS;
+    }
+  else                                                /* shrinking ht */
+    {
+      while(rehashes_left--) rehash_2positions_shrinking(ht);
+      if (ht->rehashing_position != ht->new_size)
+	return REHASHING_IN_PROGRESS;
+    }
+
+  /* rehashing finishes */
+  ht->rehashing_position = NOT_REHASHING;
+  ht->tab_size = ht->new_size;
+  free(ht->tab);
+  ht->tab = ht->newtab;
+
+  return REHASHING_NOT_IN_PROGRESS;
+}
+
+static INLINE bool resizing_taking_place(hashtable_t *ht)
+{
+  return !(ht->rehashing_position == NOT_REHASHING);
+}
+
+void ht_finish_resizing(hashtable_t *ht)
+{
+  if (resizing_taking_place(ht))
+    while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS);
+}
+
+static INLINE bool needs_growing(hashtable_t *ht)
+{
+  return ht->entries >= 3 * ht->tab_size / 4;
+}
+
+static INLINE bool needs_shrinking(hashtable_t *ht)
+{
+  return ht->tab_size > MIN_SIZE
+    && ht->entries <= ht->tab_size / 4;
+}
+/*
+ * Each of hashtable operations (add, set, rem, get) should also
+ * attempt to do part of resizing. This way resizing operation
+ * which is O(n) is distributed among many hashtable accesses
+ * each of them still being O(1). Without this the the amortized
+ * complexity of ht accesses would still be O(1), but a single access
+ * would sometimes be O(n).
+ * Other function that adds, sets, gets or removes sth from ht uses
+ * this one to do this "part of resizing" mentioned above.
+ * This func returns REHASHING_NO_MEM on failed malloc (won't happen
+ * for GET or REM operation) and REHASHING_[NOT_]IN_PROGRESS otherwise.
+ */
+static INLINE int
+do_resizing_related_stuff(hashtable_t *ht, const void *key, enum op op)
+{
+  bool resizing = resizing_taking_place(ht);
+
+  if (!resizing)
+    {
+      size_t new_size;
+
+      switch (op)
+	{
+	case GET:
+	  goto dont_start_resizing;
+	case ADD:
+	case SET:
+	  if (needs_growing(ht))
+	    new_size = ht->tab_size * 2;
+	  else
+	    goto dont_start_resizing;
+	  break;
+	default: /* case REM */
+	  if (needs_shrinking(ht))
+	    new_size = ht->tab_size / 2;
+	  else
+	    goto dont_start_resizing;
+	}
+
+      struct ht_node **newtab;
+      if (!(newtab = malloc(new_size * sizeof(struct ht_node*))))
+	return op == REM ? REHASHING_NOT_IN_PROGRESS : REHASHING_NO_MEM;
+
+      ht->newtab = newtab;
+      ht->new_size = new_size;
+      ht->rehashing_position = 0;
+
+      resizing = true;
+    }
+
+ dont_start_resizing:
+
+  return resizing ?
+    rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS;
+}
+
+/*
+ * This is a chaining hashtable, so each element in the array (table)
+ * is actually a list of entries. All operations (adding, removing,
+ * etc.) need to find the right list of entries (here called "bucket")
+ * for a given key first, so it makes sense to do it in a separate
+ * function. The bucket may be in tab or newtab if resizing is taking
+ * place. Being informed by the caller if resizing is in progress,
+ * this func does not need to check for it by itself.
+ */
+static INLINE struct ht_node **find_bucket(hashtable_t *ht, const void *key,
+					   bool resizing_in_progress)
+{
+  size_t hash = hash2(ht->hashfunc(key)),
+    destination_tab_size, position;
+
+  struct ht_node **destination_tab;
+
+  if (resizing_in_progress)
+    /*
+     * Here we must check whether our key's bucket is still
+     * in ht->tab or already rehashed to ht->newtab.
+     */
+    {
+      size_t smaller_tab_size = min(ht->tab_size, ht->new_size),
+	smaller_tab_position = hash & (smaller_tab_size - 1);
+
+      if (smaller_tab_position < ht->rehashing_position)
+	{
+	  destination_tab = ht->newtab;
+	  destination_tab_size = ht->new_size;
+	}
+      else
+	{
+	  destination_tab = ht->tab;
+	  destination_tab_size = ht->tab_size;
+	}
+    }
+  else
+    /* In this case we know, we're working on ht->tab and not newtab. */
+    {
+      destination_tab = ht->tab;
+      destination_tab_size = ht->tab_size;
+    }
+
+  position = hash & (destination_tab_size - 1);
+  return &destination_tab[position];
+}
+
+/*
+ * Operations of adding, removing, etc. all work on list of entries
+ * (bucket) to wchich key hashes and they have some common logic, so
+ * it made sense to make a single function, that does the right
+ * operation based on an enum passed to it.
+ */
+static INLINE int
+perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket,
+			    const void *key, const void *val,
+			    void **keyptr, void **valptr,
+			    enum op op)
+{
+  struct ht_node **pairptr, *pair;
+
+  for (pairptr = bucket, pair = *pairptr;
+       pair;
+       pairptr = &pair->next, pair = pair->next)
+
+    if (!ht->cmpfunc(key, pair->key))
+      {
+	if (op == ADD)
+	  return HT_KEY_PRESENT;
+
+	if (keyptr) *keyptr = (void*) pair->key;
+	if (valptr) *valptr = (void*) pair->val;
+
+	switch (op)
+	  {
+	  case GET:
+	  case GET_THREADSAFE:
+	    {
+	      return HT_OK;
+	    }
+	  case SET:
+	    {
+	      pair->key = key;
+	      pair->val = val;
+	      return HT_OK;
+	    }
+	  default: /* case REM */
+	    {
+	      *pairptr = pair->next;
+	      free(pair);
+	      ht->entries--;
+	      return HT_OK;
+	    }
+	  }
+      }
+
+  if (op == GET || op == GET_THREADSAFE || op == REM)
+    return HT_KEY_ABSENT;
+
+  /* op == ADD || op == SET */
+
+  struct ht_node *new_pair = malloc(sizeof(struct ht_node));
+  if (!new_pair)
+    return HT_NO_MEM;
+
+  *new_pair = (struct ht_node) {.key = key, .val = val};
+  push(new_pair, bucket);
+  ht->entries++;
+
+  return HT_OK;
+}
+
+/* Generic function for performing of adding, removing, setting and getting. */
+static int perform_operation(hashtable_t *ht, const void *key, const void *val,
+			     void **keyptr, void **valptr, enum op op)
+{
+  bool resizing_in_progress;
+
+  if (op == GET_THREADSAFE) {
+    resizing_in_progress = resizing_taking_place(ht);
+    goto skip_resizing;
+  }
+
+  switch (do_resizing_related_stuff(ht, key, op))
+    {
+    case REHASHING_IN_PROGRESS:
+      resizing_in_progress = true;
+      break;
+    case REHASHING_NOT_IN_PROGRESS:
+      resizing_in_progress = false;
+      break;
+    default: /* case REHASHING_NO_MEM */
+      return HT_NO_MEM;
+    }
+
+  struct ht_node **bucket;
+
+ skip_resizing:
+  bucket = find_bucket(ht, key, resizing_in_progress);
+
+  return perform_operation_on_bucket(ht, bucket, key, val,
+				     keyptr, valptr, op);
+}
+
+/* The 5 functions below are the main part of the API. */
+int ht_get(hashtable_t *ht, const void *key,
+	   void **storedkey, void **val)
+{
+  return perform_operation(ht, key, NULL, storedkey, val, GET);
+}
+
+int ht_get_threadsafe(hashtable_t *ht, const void *key,
+		      void **storedkey, void **val)
+{
+  return perform_operation(ht, key, NULL, storedkey, val,
+			   GET_THREADSAFE);
+}
+
+int ht_add(hashtable_t *ht, const void *key, const void *val)
+{
+  return perform_operation(ht, key, val, NULL, NULL, ADD);
+}
+
+int ht_set(hashtable_t *ht, const void *key, const void *val,
+	   void **oldkey, void **oldval)
+{
+  return perform_operation(ht, key, val, oldkey, oldval, SET);
+}
+
+int ht_rem(hashtable_t *ht, const void *key,
+	   void **storedkey, void **val)
+{
+  return perform_operation(ht, key, NULL, storedkey, val, REM);
+}
+
+/*
+ * As mentioned in hashtable.h, this func does not deallocate keys
+ * nor vals. One could use ht_map_destroy() if that is needed.
+ */
+void ht_destroy(hashtable_t *ht)
+{
+  ssize_t position;
+
+  if (!ht->entries)
+    goto free_tab;
+
+  ht_finish_resizing(ht);
+
+  struct ht_node **tab = ht->tab;
+
+  for (position = ht->tab_size - 1; position >= 0; position--)
+    {
+      struct ht_node *pair = tab[position], *nextpair;
+
+      while (pair)
+	{
+	  nextpair = pair->next;
+	  free(pair);
+	  pair = nextpair;
+	}
+    }
+
+ free_tab:
+  free(ht->tab);
+}
+
+void ht_map(hashtable_t *ht, void *arg,
+	    void (*mapfunc)(const void *key, void *val, void *arg))
+{
+  ssize_t position;
+
+  if (!ht->entries)
+    return;
+
+  ht_finish_resizing(ht);
+
+  struct ht_node **tab = ht->tab, *pair;
+
+  for (position = ht->tab_size - 1; position >= 0; position--)
+    {
+      for (pair = tab[position]; pair; pair = pair->next)
+	mapfunc(pair->key, (void*) pair->val, arg);
+    }
+}
+
+void ht_map_destroy(hashtable_t *ht, void *arg,
+		    void (*mapfunc)(void *key, void *val, void *arg))
+{
+  /*
+   * If mapfunc() deallocates keys, the following 2 lines make
+   * assumption on ht_destroy(), that it doesn't call ht->hashfunc()
+   * or ht->cmpfunc() on keys.
+   */
+  ht_map(ht, arg, (void (*)(const void*, void*, void*)) mapfunc);
+  ht_destroy(ht);
+}
+
+/*
+ * These 2 functions are for easy making of hashtable with strings as
+ * keys. Note that this hash is *not* secure against DoS attacks.
+ */
+size_t ht_string_hash(const char *key)
+{
+  size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4;
+
+  do
+    {
+      char shift = ((unsigned char) key[i]) % sizeof(size_t);
+      hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift)))
+	^ key[i];
+    }
+  while (key[i++]);
+
+  return hash;
+}
+
+int ht_string_init(hashtable_t *ht)
+{
+  return ht_init(ht, (size_t (*)(const void*)) &ht_string_hash,
+		 (int (*)(const void*, const void*)) &strcmp);
+}