aboutsummaryrefslogtreecommitdiff
path: root/hashtable.c
diff options
context:
space:
mode:
Diffstat (limited to 'hashtable.c')
-rw-r--r--hashtable.c565
1 files changed, 565 insertions, 0 deletions
diff --git a/hashtable.c b/hashtable.c
new file mode 100644
index 0000000..a397879
--- /dev/null
+++ b/hashtable.c
@@ -0,0 +1,565 @@
+/**
+ * C hashtable implementation
+ *
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ */
+
+/*
+ * GENERAL INFO
+ *
+ * You might want to read the beginning of hashtable.h first.
+ *
+ * In some places "rehashing" and in other ones "resizing" seemed to
+ * be the right word to use. They mean more or less the same.
+ *
+ * Functions starting with ht_ are part of the API. Internal functions
+ * are declared static. I also made some of them inline (either
+ * because they were extremely short or only called from 1 place).
+ *
+ * Hashtable size is always a power of 2.
+ *
+ * When the hashtable is ¾ full, a new, 2x bigger table is allocated
+ * and whenever one of 4 basic operations (adding, removing, setting,
+ * getting) occurs, 4 slots are being rehashed from old table into 8
+ * slots in new table. Similarly, when hashtable is ¼ full, a new,
+ * 2x smaller table is allocated and each of subsequent operations
+ * rehashes 8 entries from old table into 4 in new table.
+ * This mechanism has been made lazier: getting and removing don't
+ * trigger growing of ht even if it's 3/4 full. Similarly, getting,
+ * setting and adding don't trigger shrinking.
+ * Once resizing is triggered, however, any of the operations will
+ * contribute to rehashing. Even if, for example, the operation is
+ * ADD and the table is being shrunk.
+ * This means, that if we have a hashtable of size n which is ¾ full
+ * and growing is triggered, then each subsequent call to
+ * ht_{add,rem,get,set}() rehashes some entries and, depending on
+ * how frequently and how successfully each of these 4 funcs was
+ * called, at the end of resizing we get a size 2n hashtable which is
+ * between ¼ and ½ full. Similarly, if shrinking of a ¼ full
+ * hashtable of size n is triggered, then after some operations we
+ * get a size ½n hashtable, that is somewhere between ¼ and ¾ full.
+ * One can see now, that we always keep the hashtable between ¼ and ¾
+ * full (with the exception of a minimal size one, that can be empty).
+ */
+
+#include "hashtable.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#ifdef _USE_INLINE
+#define INLINE inline
+#else
+#define INLINE
+#endif
+
+/*
+ * We won't shrink hashtable below this size. Newly created one will
+ * be this big.
+ */
+#define MIN_SIZE 4
+
+/* Special value of ht->rehashing_position. */
+#define NOT_REHASHING ((ssize_t) -1)
+
+/*
+ * Those are possible return values of do_resizing_related_stuff()
+ * and rehash_some_entries() (which only returns the first 2).
+ */
+#define REHASHING_IN_PROGRESS 0
+#define REHASHING_NOT_IN_PROGRESS 1
+#define REHASHING_NO_MEM 2
+
+/* Struct used to store a pair. */
struct ht_node
{
    const void *key;          /* caller-owned key; never freed by the table */
    const void *val;          /* caller-owned value; never freed by the table */
    struct ht_node *next;     /* next pair in this bucket's chain, or NULL */
};
+
/* Operation selector for the shared perform_operation() machinery. */
enum op
  {
    GET,             /* look a key up */
    GET_THREADSAFE,  /* look up without mutating the table (skips rehash step) */
    ADD,             /* insert; HT_KEY_PRESENT if the key already exists */
    SET,             /* insert, or overwrite key and value of an existing pair */
    REM,             /* unlink and free the matching pair */
  };
+
+int ht_init(hashtable_t *ht,
+ size_t (*hash)(const void *key),
+ int (*cmp)(const void *key1, const void *key2))
+{
+ if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**))))
+ return HT_NO_MEM;
+
+ ht->tab_size = MIN_SIZE;
+ ht->rehashing_position = NOT_REHASHING;
+ ht->entries = 0;
+ ht->hashfunc = hash;
+ ht->cmpfunc = cmp;
+
+ return HT_OK;
+}
+
+/* First come some utilities :) */
+
+static INLINE size_t min(size_t n1, size_t n2)
+{
+ return n1 < n2 ? n1 : n2;
+}
+
+static INLINE size_t hash2(size_t n)
+{
+ /* I found this "hash improver" on the internet. */
+ n ^= (n >> 20) ^ (n >> 12);
+ return n ^ (n >> 7) ^ (n >> 4);
+}
+
+/* Below are 2 list-handling utility functions. */
+static INLINE struct ht_node *join_lists(struct ht_node *l1,
+ struct ht_node *l2)
+{
+ if (!l1) return l2;
+ if (!l2) return l1;
+
+ struct ht_node *l1_last;
+
+ for (l1_last = l1; l1_last->next; l1_last = l1_last->next);
+
+ /* Append l2 to the end of l1. */
+ l1_last->next = l2;
+
+ /* For convenience return the first element of the resulting list. */
+ return l1;
+}
+
+
+static INLINE void push(struct ht_node *node, struct ht_node **list)
+{
+ node->next = *list;
+ *list = node;
+}
+
+/*
+ * The following 2 rehash_* functions are helpers of rehash_some_entries().
+ * This func rehashes 1 chain of entries in tab[] into 2 chains in newtab[].
+ */
static INLINE void rehash_position_growing(hashtable_t *ht)
{
    /* There are 2 possible new positions of an entry in a 2x bigger ht. */
    struct ht_node *list0 = NULL, *list1 = NULL;

    /*
     * Positions are hash values masked to the table size, so when the
     * size doubles, an entry at old_position can only end up at the
     * same index or at old_position with the new top bit set.
     */
    size_t old_position = ht->rehashing_position,
	new_position0 = old_position,
	new_position1 = old_position | ht->tab_size;

    struct ht_node *pair = ht->tab[old_position], *next_pair;

    while (pair)
    {
	/* push() rewrites pair->next, so save the chain link first. */
	next_pair = pair->next;

	size_t new_position = hash2(ht->hashfunc(pair->key))
	    & (ht->new_size - 1);

	push(pair, new_position == new_position1 ? &list1 : &list0);

	pair = next_pair;
    }

    /*
     * Both slots are stored unconditionally (even when NULL); this is
     * what allows newtab to come from malloc() instead of calloc().
     */
    ht->newtab[new_position0] = list0;
    ht->newtab[new_position1] = list1;

    ht->rehashing_position++;
}
+
+/* This func rehashes 2 chains of entries in tab[] into 1 chain in newtab[]. */
/* This func rehashes 2 chains of entries in tab[] into 1 chain in newtab[]. */
static INLINE void rehash_2positions_shrinking(hashtable_t *ht)
{
    /*
     * When the size halves, the only two old chains that can map to a
     * given new index are the ones at that index and at
     * index + new_size (same mask, dropped top bit set).  No per-entry
     * hashing is needed: the two chains are simply spliced together.
     */
    size_t new_position = ht->rehashing_position,
	old_position0 = new_position,
	old_position1 = new_position | ht->new_size;

    ht->newtab[new_position] = join_lists(ht->tab[old_position0],
					  ht->tab[old_position1]);

    ht->rehashing_position++;
}
+
+/*
+ * Rehashes 4(8) positions from tab to newtab. If those were the last
+ * entries to rehash, the function takes care of everything
+ * (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS.
+ * Otherwise, returns REHASHING_IN_PROGRESS.
+ * Caller must make sure rehashing was started before calling this func.
+ */
static int rehash_some_entries(hashtable_t *ht)
{
    /*
     * Each call advances the rehash by 4 steps: 4 old positions when
     * growing, or 4 new positions (i.e. 8 old ones) when shrinking.
     * Table sizes are powers of 2 and never below MIN_SIZE (4), so
     * the number of positions to walk is always a multiple of 4 and
     * the unchecked countdown loops cannot run past the table end.
     */
    int rehashes_left = 4;

    if (ht->new_size > ht->tab_size) /* growing ht */
    {
	while(rehashes_left--) rehash_position_growing(ht);
	/* Growing walks old positions: done when all of tab[] moved. */
	if (ht->rehashing_position != ht->tab_size)
	    return REHASHING_IN_PROGRESS;
    }
    else /* shrinking ht */
    {
	while(rehashes_left--) rehash_2positions_shrinking(ht);
	/* Shrinking walks new positions: done when newtab[] is full. */
	if (ht->rehashing_position != ht->new_size)
	    return REHASHING_IN_PROGRESS;
    }

    /* rehashing finishes: newtab becomes the one and only table */
    ht->rehashing_position = NOT_REHASHING;
    ht->tab_size = ht->new_size;
    free(ht->tab);
    ht->tab = ht->newtab;

    return REHASHING_NOT_IN_PROGRESS;
}
+
+static INLINE bool resizing_taking_place(hashtable_t *ht)
+{
+ return !(ht->rehashing_position == NOT_REHASHING);
+}
+
+void ht_finish_resizing(hashtable_t *ht)
+{
+ if (resizing_taking_place(ht))
+ while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS);
+}
+
+static INLINE bool needs_growing(hashtable_t *ht)
+{
+ return ht->entries >= 3 * ht->tab_size / 4;
+}
+
+static INLINE bool needs_shrinking(hashtable_t *ht)
+{
+ return ht->tab_size > MIN_SIZE
+ && ht->entries <= ht->tab_size / 4;
+}
+/*
+ * Each of hashtable operations (add, set, rem, get) should also
+ * attempt to do part of resizing. This way resizing operation
+ * which is O(n) is distributed among many hashtable accesses
+ * each of them still being O(1). Without this the amortized
+ * complexity of ht accesses would still be O(1), but a single access
+ * would sometimes be O(n).
+ * Other function that adds, sets, gets or removes sth from ht uses
+ * this one to do this "part of resizing" mentioned above.
+ * This func returns REHASHING_NO_MEM on failed malloc (won't happen
+ * for GET or REM operation) and REHASHING_[NOT_]IN_PROGRESS otherwise.
+ */
static INLINE int
do_resizing_related_stuff(hashtable_t *ht, const void *key, enum op op)
{
    /* NOTE(review): `key` is currently unused by this function. */
    bool resizing = resizing_taking_place(ht);

    if (!resizing)
    {
	size_t new_size;

	/*
	 * Decide whether this operation should *start* a resize.
	 * Lazy policy: GET never starts one, ADD/SET may start
	 * growing, REM may start shrinking.  GET_THREADSAFE never
	 * reaches this function (see perform_operation()).
	 */
	switch (op)
	{
	case GET:
	    goto dont_start_resizing;
	case ADD:
	case SET:
	    if (needs_growing(ht))
		new_size = ht->tab_size * 2;
	    else
		goto dont_start_resizing;
	    break;
	default: /* case REM */
	    if (needs_shrinking(ht))
		new_size = ht->tab_size / 2;
	    else
		goto dont_start_resizing;
	}

	/*
	 * malloc() (not calloc()) suffices: the rehash_* helpers
	 * assign every newtab slot before find_bucket() can read it.
	 * On allocation failure a REM still succeeds (shrinking is
	 * optional), while ADD/SET must report out-of-memory.
	 */
	struct ht_node **newtab;
	if (!(newtab = malloc(new_size * sizeof(struct ht_node*))))
	    return op == REM ? REHASHING_NOT_IN_PROGRESS : REHASHING_NO_MEM;

	ht->newtab = newtab;
	ht->new_size = new_size;
	ht->rehashing_position = 0;

	resizing = true;
    }

    dont_start_resizing:

    /* Whether just started or already running, advance the rehash. */
    return resizing ?
	rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS;
}
+
+/*
+ * This is a chaining hashtable, so each element in the array (table)
+ * is actually a list of entries. All operations (adding, removing,
+ * etc.) need to find the right list of entries (here called "bucket")
+ * for a given key first, so it makes sense to do it in a separate
+ * function. The bucket may be in tab or newtab if resizing is taking
+ * place. Being informed by the caller if resizing is in progress,
+ * this func does not need to check for it by itself.
+ */
static INLINE struct ht_node **find_bucket(hashtable_t *ht, const void *key,
					   bool resizing_in_progress)
{
    size_t hash = hash2(ht->hashfunc(key)),
	destination_tab_size, position;

    struct ht_node **destination_tab;

    if (resizing_in_progress)
	/*
	 * Here we must check whether our key's bucket is still
	 * in ht->tab or already rehashed to ht->newtab.
	 */
    {
	/*
	 * rehashing_position always indexes the smaller of the two
	 * tables (old positions when growing, new positions when
	 * shrinking), so masking the hash to that size and comparing
	 * against it tells us on which side of the rehash frontier
	 * this key's bucket currently lies.
	 */
	size_t smaller_tab_size = min(ht->tab_size, ht->new_size),
	    smaller_tab_position = hash & (smaller_tab_size - 1);

	if (smaller_tab_position < ht->rehashing_position)
	{
	    /* Already rehashed - the live bucket is in newtab. */
	    destination_tab = ht->newtab;
	    destination_tab_size = ht->new_size;
	}
	else
	{
	    /* Not yet rehashed - the bucket is still in the old tab. */
	    destination_tab = ht->tab;
	    destination_tab_size = ht->tab_size;
	}
    }
    else
	/* In this case we know, we're working on ht->tab and not newtab. */
    {
	destination_tab = ht->tab;
	destination_tab_size = ht->tab_size;
    }

    position = hash & (destination_tab_size - 1);
    return &destination_tab[position];
}
+
+/*
+ * Operations of adding, removing, etc. all work on list of entries
+ * (bucket) to which key hashes and they have some common logic, so
+ * it made sense to make a single function, that does the right
+ * operation based on an enum passed to it.
+ */
static INLINE int
perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket,
			    const void *key, const void *val,
			    void **keyptr, void **valptr,
			    enum op op)
{
    struct ht_node **pairptr, *pair;

    /*
     * Walk the chain keeping a pointer to the previous link's `next`
     * field (pairptr), so a matching node can be unlinked in O(1)
     * without a separate "previous node" special case for the head.
     */
    for (pairptr = bucket, pair = *pairptr;
	 pair;
	 pairptr = &pair->next, pair = pair->next)

	if (!ht->cmpfunc(key, pair->key))
	{
	    /* Key found. ADD must not touch the existing pair. */
	    if (op == ADD)
		return HT_KEY_PRESENT;

	    /* Hand the stored pair back to the caller if requested. */
	    if (keyptr) *keyptr = (void*) pair->key;
	    if (valptr) *valptr = (void*) pair->val;

	    switch (op)
	    {
	    case GET:
	    case GET_THREADSAFE:
		{
		    return HT_OK;
		}
	    case SET:
		{
		    /* Replace both the key and the value pointers. */
		    pair->key = key;
		    pair->val = val;
		    return HT_OK;
		}
	    default: /* case REM */
		{
		    /* Unlink and free the node; key/val stay caller-owned. */
		    *pairptr = pair->next;
		    free(pair);
		    ht->entries--;
		    return HT_OK;
		}
	    }
	}

    /* Key not present in this bucket. */
    if (op == GET || op == GET_THREADSAFE || op == REM)
	return HT_KEY_ABSENT;

    /* op == ADD || op == SET */

    struct ht_node *new_pair = malloc(sizeof(struct ht_node));
    if (!new_pair)
	return HT_NO_MEM;

    /* Prepend the new pair to the bucket's chain. */
    *new_pair = (struct ht_node) {.key = key, .val = val};
    push(new_pair, bucket);
    ht->entries++;

    return HT_OK;
}
+
+/* Generic function for performing of adding, removing, setting and getting. */
/* Generic function for performing of adding, removing, setting and getting. */
static int perform_operation(hashtable_t *ht, const void *key, const void *val,
			     void **keyptr, void **valptr, enum op op)
{
    bool resizing_in_progress;

    /*
     * GET_THREADSAFE performs no mutation at all: it skips the
     * resizing step (which would rehash entries) and only reads
     * whether a resize is in progress so find_bucket() can pick the
     * right table.
     */
    if (op == GET_THREADSAFE) {
	resizing_in_progress = resizing_taking_place(ht);
	goto skip_resizing;
    }

    /* Advance (or possibly start) an incremental resize. */
    switch (do_resizing_related_stuff(ht, key, op))
    {
    case REHASHING_IN_PROGRESS:
	resizing_in_progress = true;
	break;
    case REHASHING_NOT_IN_PROGRESS:
	resizing_in_progress = false;
	break;
    default: /* case REHASHING_NO_MEM */
	return HT_NO_MEM;
    }

    struct ht_node **bucket;

    skip_resizing:
    bucket = find_bucket(ht, key, resizing_in_progress);

    return perform_operation_on_bucket(ht, bucket, key, val,
				       keyptr, valptr, op);
}
+
+/* The 5 functions below are the main part of the API. */
/*
 * Look `key` up.  On a hit, the table's stored key is written to
 * *storedkey and the value to *val (either pointer may be NULL) and
 * HT_OK is returned; on a miss, HT_KEY_ABSENT.  May advance an
 * in-progress incremental resize as a side effect.
 */
int ht_get(hashtable_t *ht, const void *key,
	   void **storedkey, void **val)
{
    return perform_operation(ht, key, NULL, storedkey, val, GET);
}
+
/*
 * Like ht_get(), but performs no mutation whatsoever: the incremental
 * resizing step is skipped entirely (see perform_operation()).
 */
int ht_get_threadsafe(hashtable_t *ht, const void *key,
		      void **storedkey, void **val)
{
    return perform_operation(ht, key, NULL, storedkey, val,
			     GET_THREADSAFE);
}
+
/*
 * Insert a new pair.  Returns HT_OK on success, HT_KEY_PRESENT if an
 * equal key is already stored (the table is left unchanged) or
 * HT_NO_MEM on allocation failure.
 */
int ht_add(hashtable_t *ht, const void *key, const void *val)
{
    return perform_operation(ht, key, val, NULL, NULL, ADD);
}
+
/*
 * Insert or replace.  If the key was already present, the old key and
 * value are first written through *oldkey / *oldval (either may be
 * NULL) and then replaced; on a fresh insert *oldkey / *oldval are
 * left untouched.  Returns HT_OK or HT_NO_MEM.
 */
int ht_set(hashtable_t *ht, const void *key, const void *val,
	   void **oldkey, void **oldval)
{
    return perform_operation(ht, key, val, oldkey, oldval, SET);
}
+
/*
 * Remove the pair matching `key`.  The stored key and value are
 * passed back through *storedkey / *val (either may be NULL) so the
 * caller can free them.  Returns HT_OK or HT_KEY_ABSENT.
 */
int ht_rem(hashtable_t *ht, const void *key,
	   void **storedkey, void **val)
{
    return perform_operation(ht, key, NULL, storedkey, val, REM);
}
+
+/*
+ * As mentioned in hashtable.h, this func does not deallocate keys
+ * nor vals. One could use ht_map_destroy() if that is needed.
+ */
+void ht_destroy(hashtable_t *ht)
+{
+ ssize_t position;
+
+ if (!ht->entries)
+ goto free_tab;
+
+ ht_finish_resizing(ht);
+
+ struct ht_node **tab = ht->tab;
+
+ for (position = ht->tab_size - 1; position >= 0; position--)
+ {
+ struct ht_node *pair = tab[position], *nextpair;
+
+ while (pair)
+ {
+ nextpair = pair->next;
+ free(pair);
+ pair = nextpair;
+ }
+ }
+
+ free_tab:
+ free(ht->tab);
+}
+
+void ht_map(hashtable_t *ht, void *arg,
+ void (*mapfunc)(const void *key, void *val, void *arg))
+{
+ ssize_t position;
+
+ if (!ht->entries)
+ return;
+
+ ht_finish_resizing(ht);
+
+ struct ht_node **tab = ht->tab, *pair;
+
+ for (position = ht->tab_size - 1; position >= 0; position--)
+ {
+ for (pair = tab[position]; pair; pair = pair->next)
+ mapfunc(pair->key, (void*) pair->val, arg);
+ }
+}
+
/*
 * Run mapfunc over all pairs (typically to free keys/values), then
 * destroy the table.
 */
void ht_map_destroy(hashtable_t *ht, void *arg,
		    void (*mapfunc)(void *key, void *val, void *arg))
{
    /*
     * If mapfunc() deallocates keys, the following 2 lines make
     * assumption on ht_destroy(), that it doesn't call ht->hashfunc()
     * or ht->cmpfunc() on keys.
     */
    /*
     * NOTE(review): calling mapfunc through a pointer cast that turns
     * its `void *` parameters into `const void *` is not strictly
     * conforming C (the types are not compatible), though it works on
     * common ABIs.  Thin wrapper functions would be pedantically safe.
     */
    ht_map(ht, arg, (void (*)(const void*, void*, void*)) mapfunc);
    ht_destroy(ht);
}
+
+/*
+ * These 2 functions are for easy making of hashtable with strings as
+ * keys. Note that this hash is *not* secure against DoS attacks.
+ */
/*
 * Hash a NUL-terminated string (the terminating byte itself is folded
 * in as the last step).  NOTE(review): the shift pair below uses
 * sizeof(size_t) - a byte count, typically 8 - rather than the bit
 * width, so it is not a full-word rotation; kept exactly as-is to
 * preserve existing hash values.
 */
size_t ht_string_hash(const char *key)
{
    size_t hash = (size_t) 0xa1bad2dead3beef4;
    const char *p = key;

    for (;;)
    {
	char shift = ((unsigned char) *p) % sizeof(size_t);

	hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift)))
	    ^ *p;

	if (!*p)
	    break;
	p++;
    }

    return hash;
}
+
/*
 * Convenience initializer for tables keyed by NUL-terminated strings.
 * NOTE(review): casting ht_string_hash / strcmp to the generic
 * callback types and invoking them through those pointers is not
 * strictly conforming C (`const char *` and `const void *` parameters
 * are not compatible types), though it works on common ABIs.
 */
int ht_string_init(hashtable_t *ht)
{
    return ht_init(ht, (size_t (*)(const void*)) &ht_string_hash,
		   (int (*)(const void*, const void*)) &strcmp);
}