author    Wojtek Kosior <kwojtus@protonmail.com>  2019-07-23 13:47:34 +0200
committer Wojtek Kosior <kwojtus@protonmail.com>  2019-07-23 13:47:34 +0200
commit    4d7361009863a0ea2bd75919a79574cdc213c72b (patch)
tree      9ac6eed1af375b66f12fd6f776f71201b6e69403 /hashtable.c
initial commit
Diffstat (limited to 'hashtable.c')
-rw-r--r--  hashtable.c  502
1 file changed, 502 insertions(+), 0 deletions(-)
diff --git a/hashtable.c b/hashtable.c
new file mode 100644
index 0000000..c2dc582
--- /dev/null
+++ b/hashtable.c
@@ -0,0 +1,502 @@
+// GENERAL INFO
+
+// You might want to read the beginning of hashtable.h first.
+
+// In some places "rehashing" and in others "resizing" seemed to be
+// the right word to use; they mean more or less the same thing here.
+
+// Functions starting with ht_ are part of the API. Internal functions
+// are declared static. I also made some of them inline (either
+// because they were extremely short or only called from 1 place).
+
+// Hashtable size is always a power of 2.
+
+// When the hashtable becomes ¾ full, a new, 2x bigger table is
+// allocated, and whenever one of the 4 basic operations (adding,
+// removing, setting, getting) occurs, 4 slots of the old table are
+// rehashed into 8 slots of the new one. Similarly, when the
+// hashtable is only ¼ full, a new, 2x smaller table is allocated,
+// and each subsequent operation rehashes 8 entries of the old table
+// into 4 of the new one.
+// This mechanism has been made lazier: getting and removing don't
+// trigger growing of the ht even if it's ¾ full. Similarly, getting,
+// setting and adding don't trigger shrinking.
+// Once resizing is triggered, however, any of the operations will
+// contribute to rehashing, even if, for example, the operation is
+// ADD and the table is being shrunk.
+// This means that if we have a hashtable of size n which is ¾ full
+// and growing is triggered, then each subsequent call to
+// ht_{add,rem,get,set}() rehashes some entries and, depending on
+// how frequently and how successfully each of these 4 funcs is
+// called, at the end of resizing we get a size 2n hashtable which is
+// between ¼ and ½ full. Similarly, if shrinking of a ¼ full
+// hashtable of size n is triggered, then after some operations we
+// get a size ½n hashtable which is somewhere between ¼ and ¾ full.
+// One can see now that the hashtable always stays between ¼ and ¾
+// full, with the exception of a minimal size one, which can even be
+// empty.
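+//
+// A concrete example with the numbers used in this file (MIN_SIZE is
+// 32): once a size-32 table holds 24 entries (¾ full), the next
+// add/set starts growing. Rehashing the 32 old positions at 4 per
+// operation takes 8 ht_*() calls (including the triggering one).
+// If all 8 are successful ht_add()s, the size-64 table ends up with
+// 32 entries (½ full); if all 8 are successful ht_rem()s, with 16
+// entries (¼ full).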
+
+#include "hashtable.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+// We won't shrink the hashtable below this size. A newly created
+// one starts this big.
+#define MIN_SIZE 32
+
+// Special value of ht->rehashing_position.
+#define NOT_REHASHING ((ssize_t) -1)
+
+// These are the possible return values of do_resizing_related_stuff()
+// and rehash_some_entries() (the latter only returns the first 2).
+#define REHASHING_IN_PROGRESS 0
+#define REHASHING_NOT_IN_PROGRESS 1
+#define REHASHING_NO_MEM 2
+
+enum op
+ {
+ GET,
+ ADD,
+ SET,
+ REM,
+ };
+
+int ht_init(hashtable_t *ht,
+ size_t (*hash)(void *key),
+ int (*cmp)(void *key1, void *key2))
+{
+  if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node*))))
+ return HT_NO_MEM;
+
+ ht->tab_size = MIN_SIZE;
+ ht->rehashing_position = NOT_REHASHING;
+ ht->entries = 0;
+ ht->hashfunc = hash;
+ ht->cmpfunc = cmp;
+
+ return HT_OK;
+}
+
+// First come some utilities :)
+
+static inline size_t min(size_t n1, size_t n2)
+{
+ return n1 < n2 ? n1 : n2;
+}
+
+static inline size_t hash2(size_t n)
+{
+  // I found this "hash improver" on the internet (it appears to be
+  // the supplemental hash function used by Java's HashMap). It
+  // folds the high bits into the low ones, which matters because
+  // only the low bits are used as the table index.
+  n ^= (n >> 20) ^ (n >> 12);
+  return n ^ (n >> 7) ^ (n >> 4);
+}
+
+// Below are 2 list-handling utility functions.
+static inline struct ht_node *join_lists(struct ht_node *l1,
+ struct ht_node *l2)
+{
+ if (!l1) return l2;
+ if (!l2) return l1;
+
+ struct ht_node *l1_last;
+
+ for (l1_last = l1; l1_last->next; l1_last = l1_last->next);
+
+ // Append l2 to the end of l1.
+ l1_last->next = l2;
+
+ // For convenience return the first element of the resulting list.
+ return l1;
+}
+
+
+static inline void push(struct ht_node *node, struct ht_node **list)
+{
+ node->next = *list;
+ *list = node;
+}
+
+// The following 2 rehash_* functions are helpers of
+// rehash_some_entries().
+// This func rehashes 1 chain of entries in tab[]
+// into 2 chains in newtab[].
+static inline void rehash_position_growing(hashtable_t *ht)
+{
+ // There are 2 possible new positions of an entry in a 2x bigger ht.
+ struct ht_node *list0 = NULL, *list1 = NULL;
+
+ size_t old_position = ht->rehashing_position,
+ new_position0 = old_position,
+ new_position1 = old_position | ht->tab_size;
+
+ struct ht_node *pair = ht->tab[old_position], *next_pair;
+
+ while (pair)
+ {
+ next_pair = pair->next;
+
+ size_t new_position = hash2(ht->hashfunc(pair->key))
+ & (ht->new_size - 1);
+
+ push(pair, new_position == new_position1 ? &list1 : &list0);
+
+ pair = next_pair;
+ }
+
+ ht->newtab[new_position0] = list0;
+ ht->newtab[new_position1] = list1;
+
+ ht->rehashing_position++;
+}
+
+// This func rehashes 2 chains of entries in tab[]
+// into 1 chain in newtab[].
+static inline void rehash_2positions_shrinking(hashtable_t *ht)
+{
+ size_t new_position = ht->rehashing_position,
+ old_position0 = new_position,
+ old_position1 = new_position | ht->new_size;
+
+ ht->newtab[new_position] = join_lists(ht->tab[old_position0],
+ ht->tab[old_position1]);
+
+ ht->rehashing_position++;
+}
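+
+// To illustrate the index arithmetic above (with sizes smaller than
+// MIN_SIZE for readability): when growing from size 8 to 16, the
+// chain at old position 3 (binary 0011) splits between new positions
+// 3 (0011) and 11 (1011); the new index is just the old one with one
+// more hash bit on top. Shrinking from 16 to 8 is the exact reverse:
+// the chains at old positions 3 and 11 merge into new position 3.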
+
+// Rehashes a batch of positions from tab to newtab: 4 old positions
+// into 8 new ones when growing, 8 old positions into 4 new ones when
+// shrinking. If those were the last entries to rehash, the function
+// takes care of everything (like deallocating the old tab) and
+// returns REHASHING_NOT_IN_PROGRESS. Otherwise it returns
+// REHASHING_IN_PROGRESS.
+// The caller must make sure rehashing was started before calling
+// this func.
+static int rehash_some_entries(hashtable_t *ht)
+{
+ int rehashes_left = 4;
+
+ if (ht->new_size > ht->tab_size) // growing ht
+ {
+ while(rehashes_left--) rehash_position_growing(ht);
+ if (ht->rehashing_position != ht->tab_size)
+ return REHASHING_IN_PROGRESS;
+ }
+ else // shrinking ht
+ {
+ while(rehashes_left--) rehash_2positions_shrinking(ht);
+ if (ht->rehashing_position != ht->new_size)
+ return REHASHING_IN_PROGRESS;
+ }
+
+ // rehashing finishes
+ ht->rehashing_position = NOT_REHASHING;
+ ht->tab_size = ht->new_size;
+ free(ht->tab);
+ ht->tab = ht->newtab;
+
+ return REHASHING_NOT_IN_PROGRESS;
+}
+
+static inline bool resizing_taking_place(hashtable_t *ht)
+{
+  return ht->rehashing_position != NOT_REHASHING;
+}
+
+void ht_finish_resizing(hashtable_t *ht)
+{
+ if (resizing_taking_place(ht))
+ while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS);
+}
+
+static inline bool needs_growing(hashtable_t *ht)
+{
+ return ht->entries == 3 * ht->tab_size / 4;
+}
+
+static inline bool needs_shrinking(hashtable_t *ht)
+{
+ return ht->tab_size > MIN_SIZE
+ && ht->entries == ht->tab_size / 4;
+}
+
+// Each of the hashtable operations (add, set, rem, get) should also
+// attempt to do a part of any pending resizing. This way the
+// resizing work, which is O(n), is distributed among many hashtable
+// accesses, each of them remaining O(1). Without this, the amortized
+// complexity of ht accesses would still be O(1), but a single access
+// would sometimes be O(n).
+// Every function that adds, sets, gets or removes something from the
+// ht uses this one to do the "part of resizing" mentioned above.
+// This func returns REHASHING_NO_MEM on a failed malloc (which won't
+// happen for a GET operation) and REHASHING_(NOT_)IN_PROGRESS
+// otherwise.
+static inline int do_resizing_related_stuff(hashtable_t *ht, enum op op)
+{
+ bool resizing = resizing_taking_place(ht);
+
+ if (!resizing)
+ {
+ size_t new_size;
+
+ switch (op)
+ {
+ case GET:
+ goto dont_start_resizing;
+ case ADD:
+ case SET:
+ if (needs_growing(ht))
+ new_size = ht->tab_size * 2;
+ else
+ goto dont_start_resizing;
+ break;
+ default: // case REM
+ if (needs_shrinking(ht))
+ new_size = ht->tab_size / 2;
+ else
+ goto dont_start_resizing;
+ }
+
+ struct ht_node **newtab;
+ if (!(newtab = malloc(new_size * sizeof(struct ht_node*))))
+ return REHASHING_NO_MEM;
+
+ ht->newtab = newtab;
+ ht->new_size = new_size;
+ ht->rehashing_position = 0;
+
+ resizing = true;
+ }
+
+ dont_start_resizing:
+
+ return resizing ?
+ rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS;
+}
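+
+// A quick cost sketch for the above: starting a resize costs one
+// allocation, and each subsequent ht_*() call rehashes a constant
+// number of positions (4 or 8), so every single call stays O(1)
+// while the full O(n) rehash is spread over O(n) calls.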
+
+// This is a chaining hashtable, so each element in the array (table)
+// is actually a list of entries. All operations (adding, removing,
+// etc.) need to find the right list of entries (here called "bucket")
+// for a given key first, so it makes sense to do it in a separate
+// function. The bucket may be in tab or newtab if resizing is taking
+// place. Being informed by the caller if resizing is in progress,
+// this func does not need to check for it itself.
+static inline struct ht_node **find_bucket(hashtable_t *ht, void *key,
+ bool resizing_in_progress)
+{
+ size_t hash = hash2(ht->hashfunc(key)),
+ destination_tab_size, position;
+
+ struct ht_node **destination_tab;
+
+ if (resizing_in_progress)
+ // Here we must check whether our key's bucket is still
+ // in ht->tab or already rehashed to ht->newtab.
+ {
+ size_t smaller_tab_size = min(ht->tab_size, ht->new_size),
+ smaller_tab_position = hash & (smaller_tab_size - 1);
+
+ if (smaller_tab_position < ht->rehashing_position)
+ {
+ destination_tab = ht->newtab;
+ destination_tab_size = ht->new_size;
+ }
+ else
+ {
+ destination_tab = ht->tab;
+ destination_tab_size = ht->tab_size;
+ }
+ }
+ else
+  // In this case we know we're working on ht->tab, not newtab.
+ {
+ destination_tab = ht->tab;
+ destination_tab_size = ht->tab_size;
+ }
+
+ position = hash & (destination_tab_size - 1);
+ return &destination_tab[position];
+}
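+
+// An example of the check above: say tab_size == 32, new_size == 64
+// and rehashing_position == 5. A key whose hash gives position 3 in
+// the size-32 table has already been rehashed, so its bucket is
+// looked up in newtab (with the mask 63); a key mapping to position
+// 20 has not, so its bucket is still in tab.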
+
+// The operations of adding, removing, etc. all work on the list of
+// entries (bucket) to which the key hashes, and they share some
+// common logic, so it made sense to have a single function that does
+// the right operation based on an enum passed to it.
+static inline int
+perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket,
+ void *key, void *val,
+ void **keyptr, void **valptr,
+ enum op op)
+{
+ for (struct ht_node **pairptr = bucket, *pair = *pairptr;
+ pair;
+ pairptr = &pair->next, pair = pair->next)
+
+ if (!ht->cmpfunc(key, pair->key))
+ {
+ if (op == ADD)
+ return HT_KEY_PRESENT;
+
+ if (keyptr) *keyptr = pair->key;
+ if (valptr) *valptr = pair->val;
+
+ switch (op)
+ {
+ case GET:
+ {
+ return HT_OK;
+ }
+ case SET:
+ {
+ pair->key = key;
+ pair->val = val;
+ return HT_OK;
+ }
+ default: // case REM
+ {
+ *pairptr = pair->next;
+ free(pair);
+ ht->entries--;
+ return HT_OK;
+ }
+ }
+ }
+
+ if (op == GET || op == REM)
+ return HT_KEY_ABSENT;
+
+ // op == ADD || op == SET
+
+ struct ht_node *new_pair = malloc(sizeof(struct ht_node));
+ if (!new_pair)
+ return HT_NO_MEM;
+
+ *new_pair = (struct ht_node) {.key = key, .val = val};
+ push(new_pair, bucket);
+ ht->entries++;
+
+ return HT_OK;
+}
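+
+// The semantics above, illustrated: with the pair ("a", 1) already
+// in the table, ADD of key "a" returns HT_KEY_PRESENT and changes
+// nothing, while SET of ("a", 2) stores the new pair and hands the
+// old key and value back through keyptr/valptr.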
+
+// Generic function implementing adding, removing, setting and
+// getting.
+static int perform_operation(hashtable_t *ht, void *key, void *val,
+ void **keyptr, void **valptr, enum op op)
+{
+ bool resizing_in_progress;
+
+  switch (do_resizing_related_stuff(ht, op))
+ {
+ case REHASHING_IN_PROGRESS:
+ resizing_in_progress = true;
+ break;
+ case REHASHING_NOT_IN_PROGRESS:
+ resizing_in_progress = false;
+ break;
+ default: // case REHASHING_NO_MEM
+ return HT_NO_MEM;
+ }
+
+ struct ht_node **bucket =
+ find_bucket(ht, key, resizing_in_progress);
+
+ return perform_operation_on_bucket(ht, bucket, key, val,
+ keyptr, valptr, op);
+}
+
+// The 4 functions below are the main part of the API.
+int ht_get(hashtable_t *ht, void *key,
+ void **storedkey, void **val)
+{
+ return perform_operation(ht, key, NULL, storedkey, val, GET);
+}
+
+int ht_add(hashtable_t *ht, void *key, void *val)
+{
+ return perform_operation(ht, key, val, NULL, NULL, ADD);
+}
+
+int ht_set(hashtable_t *ht, void *key, void *val,
+ void **oldkey, void **oldval)
+{
+ return perform_operation(ht, key, val, oldkey, oldval, SET);
+}
+
+int ht_rem(hashtable_t *ht, void *key,
+ void **storedkey, void **val)
+{
+ return perform_operation(ht, key, NULL, storedkey, val, REM);
+}
+
+// As mentioned in hashtable.h, this func deallocates neither keys
+// nor vals. One can use ht_map() to free() them before calling this
+// func.
+void ht_destroy(hashtable_t *ht)
+{
+  // No early return for an empty table here: ht->tab itself still
+  // needs to be freed.
+ ht_finish_resizing(ht);
+
+ struct ht_node **tab = ht->tab;
+
+ for (ssize_t position = ht->tab_size - 1;
+ position >= 0; position--)
+ {
+ struct ht_node *pair = tab[position], *nextpair;
+
+ while (pair)
+ {
+ nextpair = pair->next;
+ free(pair);
+ pair = nextpair;
+ }
+ }
+
+ free(ht->tab);
+}
+
+void ht_map(hashtable_t *ht, void *arg,
+ void (*mapfunc)(void *key, void *val, void *arg))
+{
+ if (!ht->entries) return;
+ ht_finish_resizing(ht);
+
+ struct ht_node **tab = ht->tab;
+
+ for (ssize_t position = ht->tab_size - 1;
+ position >= 0; position--)
+
+ for (struct ht_node *pair = tab[position];
+ pair; pair = pair->next)
+
+ mapfunc(pair->key, pair->val, arg);
+}
+
+void ht_map_destroy(hashtable_t *ht, void *arg,
+ void (*mapfunc)(void *key, void *val, void *arg))
+{
+  // If mapfunc() deallocates keys, the next 2 lines rely on the
+  // fact that ht_destroy() calls neither ht->hashfunc() nor
+  // ht->cmpfunc() on the keys.
+ ht_map(ht, arg, mapfunc);
+ ht_destroy(ht);
+}
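+
+// For example, a table owning malloc()'d keys and values could be
+// torn down with a mapfunc like this (hypothetical helper, not part
+// of this file):
+//
+//   void free_pair(void *key, void *val, void *arg)
+//   {
+//       free(key);
+//       free(val);
+//   }
+//
+//   ht_map_destroy(&ht, NULL, &free_pair);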
+
+// These 2 functions make it easy to create a hashtable with strings
+// as keys.
+size_t ht_string_hash(const char *key)
+{
+  const unsigned bits = sizeof(size_t) * 8;
+  size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4;
+
+  do
+    {
+      // Rotate the hash right by a key-dependent number of bits
+      // before mixing in the next character; the "-shift & (bits - 1)"
+      // form keeps the left shift in range even when shift == 0.
+      unsigned shift = ((unsigned char) key[i]) % bits;
+      hash += ((hash >> shift) | (hash << (-shift & (bits - 1))))
+        ^ key[i];
+    }
+  while (key[i++]);
+
+  return hash;
+}
+
+int ht_string_init(hashtable_t *ht)
+{
+ return ht_init(ht, (size_t (*)(void*)) &ht_string_hash,
+ (int (*)(void*, void*)) &strcmp);
+}
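+
+// A minimal usage sketch (example calling code, not part of this
+// file):
+//
+//   hashtable_t ht;
+//
+//   if (ht_string_init(&ht) != HT_OK)
+//       exit(1);
+//
+//   ht_add(&ht, "pi", "3.14159");
+//
+//   void *val;
+//   if (ht_get(&ht, "pi", NULL, &val) == HT_OK)
+//       printf("pi -> %s\n", (char*) val);
+//
+//   ht_destroy(&ht); // keys/vals here are literals; nothing to free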