diff options
Diffstat (limited to 'hashtable.c')
-rw-r--r-- | hashtable.c | 565 |
1 files changed, 565 insertions, 0 deletions
diff --git a/hashtable.c b/hashtable.c new file mode 100644 index 0000000..a397879 --- /dev/null +++ b/hashtable.c @@ -0,0 +1,565 @@ +/** + * C hashtable implementation + * + * Copyright (C) 2021 Wojtek Kosior + * Redistribution terms are gathered in the `copyright' file. + */ + +/* + * GENERAL INFO + * + * You might want to read the beginning of hashtable.h first. + * + * In some places "rehashing" and in other ones "resizing" seemed to + * be the right word to use. They mean more or less the same. + * + * Functions starting with ht_ are part of the API. Internal functions + * are declared static. I also made some of them inline (either + * because they were extremely short or only called from 1 place). + * + * Hashtable size is always a power of 2. + * + * When the hashtable is ¾ full, a new, 2x bigger table is allocated + * and whenever one of 4 basic operations (adding, removing, setting, + * getting) occurs, 4 slots are being rehashed from old table into 8 + * slots in new table. Similarly, when hashtable is ¼ full, a new, + * 2x smaller table is allocated and each of subsequent operations + * rehashes 8 entries from old table into 4 in new table. + * This mechanism has been made lazier: getting and removing don't + * trigger growing of ht even if it's 3/4 full. Similarly, getting, + * setting and adding don't trigger shrinking. + * Once resizing is triggered, however, any of the operations will + * contribute to rehashing. Even if, for example, the operation is + * ADD and the table is being shrinked. + * This means, that if we have a hashtable of size n which is ¾ full + * and growing is triggered, then each subsequent call to + * ht_{add,rem,get,set}() rehashes some entries and, depending on + * how frequently and how successfully each of these 4 funcs was + * called, at the end of resizing we get a size 2n hashtable which is + * between ¼ and ½ full. Similarly, if shrinking of a ¼ full + * hashtable of size n is triggered, then after some operations we + * get a size ½n hashtable, that is somewhere between ¼ and ¾ full. + * One can see now, that we always keep the hashtable between ¼ and ¾ + * full (with the exception of a minimal size one, that can be empty). + */ + +#include "hashtable.h" + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <stdbool.h> + +#ifdef _USE_INLINE +#define INLINE inline +#else +#define INLINE +#endif + +/* + * We won't shrink hashtable below this size. Newly created one will + * be this big. + */ +#define MIN_SIZE 4 + +/* Special value of ht->rehashing_position. */ +#define NOT_REHASHING ((ssize_t) -1) + +/* + * Those are possible return values of do_resizing_related_stuff() + * and rehash_some_entries() (which only returns the first 2). + */ +#define REHASHING_IN_PROGRESS 0 +#define REHASHING_NOT_IN_PROGRESS 1 +#define REHASHING_NO_MEM 2 + +/* Struct used to store a pair. */ +struct ht_node +{ + const void *key; + const void *val; + struct ht_node *next; +}; + +enum op + { + GET, + GET_THREADSAFE, + ADD, + SET, + REM, + }; + +int ht_init(hashtable_t *ht, + size_t (*hash)(const void *key), + int (*cmp)(const void *key1, const void *key2)) +{ + if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**)))) + return HT_NO_MEM; + + ht->tab_size = MIN_SIZE; + ht->rehashing_position = NOT_REHASHING; + ht->entries = 0; + ht->hashfunc = hash; + ht->cmpfunc = cmp; + + return HT_OK; +} + +/* First come some utilities :) */ + +static INLINE size_t min(size_t n1, size_t n2) +{ + return n1 < n2 ? n1 : n2; +} + +static INLINE size_t hash2(size_t n) +{ + /* I found this "hash improver" on the internet. */ + n ^= (n >> 20) ^ (n >> 12); + return n ^ (n >> 7) ^ (n >> 4); +} + +/* Below are 2 list-handling utility functions. */ +static INLINE struct ht_node *join_lists(struct ht_node *l1, + struct ht_node *l2) +{ + if (!l1) return l2; + if (!l2) return l1; + + struct ht_node *l1_last; + + for (l1_last = l1; l1_last->next; l1_last = l1_last->next); + + /* Append l2 to the end of l1. */ + l1_last->next = l2; + + /* For convenience return the first element of the resulting list. */ + return l1; +} + + +static INLINE void push(struct ht_node *node, struct ht_node **list) +{ + node->next = *list; + *list = node; +} + +/* + * The following 2 rehash_* functions are helpers of rehash_some_entries(). + * This func rehashes 1 chain of entries in tab[] into 2 chains in newtab[]. + */ +static INLINE void rehash_position_growing(hashtable_t *ht) +{ + /* There are 2 possible new positions of an entry in a 2x bigger ht. */ + struct ht_node *list0 = NULL, *list1 = NULL; + + size_t old_position = ht->rehashing_position, + new_position0 = old_position, + new_position1 = old_position | ht->tab_size; + + struct ht_node *pair = ht->tab[old_position], *next_pair; + + while (pair) + { + next_pair = pair->next; + + size_t new_position = hash2(ht->hashfunc(pair->key)) + & (ht->new_size - 1); + + push(pair, new_position == new_position1 ? &list1 : &list0); + + pair = next_pair; + } + + ht->newtab[new_position0] = list0; + ht->newtab[new_position1] = list1; + + ht->rehashing_position++; +} + +/* This func rehashes 2 chains of entries in tab[] into 1 chain in newtab[]. */ +static INLINE void rehash_2positions_shrinking(hashtable_t *ht) +{ + size_t new_position = ht->rehashing_position, + old_position0 = new_position, + old_position1 = new_position | ht->new_size; + + ht->newtab[new_position] = join_lists(ht->tab[old_position0], + ht->tab[old_position1]); + + ht->rehashing_position++; +} + +/* + * Rehashes 4(8) positions from tab to newtab. If those were the last + * enetries to rehash, the function takes care of everything + * (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS. + * Otherwise, returns REHASHING_IN_PROGRESS. + * Caller must make sure rehashing was started b4 calling this func. + */ +static int rehash_some_entries(hashtable_t *ht) +{ + int rehashes_left = 4; + + if (ht->new_size > ht->tab_size) /* growing ht */ + { + while(rehashes_left--) rehash_position_growing(ht); + if (ht->rehashing_position != ht->tab_size) + return REHASHING_IN_PROGRESS; + } + else /* shrinking ht */ + { + while(rehashes_left--) rehash_2positions_shrinking(ht); + if (ht->rehashing_position != ht->new_size) + return REHASHING_IN_PROGRESS; + } + + /* rehashing finishes */ + ht->rehashing_position = NOT_REHASHING; + ht->tab_size = ht->new_size; + free(ht->tab); + ht->tab = ht->newtab; + + return REHASHING_NOT_IN_PROGRESS; +} + +static INLINE bool resizing_taking_place(hashtable_t *ht) +{ + return !(ht->rehashing_position == NOT_REHASHING); +} + +void ht_finish_resizing(hashtable_t *ht) +{ + if (resizing_taking_place(ht)) + while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS); +} + +static INLINE bool needs_growing(hashtable_t *ht) +{ + return ht->entries >= 3 * ht->tab_size / 4; +} + +static INLINE bool needs_shrinking(hashtable_t *ht) +{ + return ht->tab_size > MIN_SIZE + && ht->entries <= ht->tab_size / 4; +} +/* + * Each of hashtable operations (add, set, rem, get) should also + * attempt to do part of resizing. This way resizing operation + * which is O(n) is distributed among many hashtable accesses + * each of them still being O(1). Without this the the amortized + * complexity of ht accesses would still be O(1), but a single access + * would sometimes be O(n). + * Other function that adds, sets, gets or removes sth from ht uses + * this one to do this "part of resizing" mentioned above. + * This func returns REHASHING_NO_MEM on failed malloc (won't happen + * for GET or REM operation) and REHASHING_[NOT_]IN_PROGRESS otherwise. + */ +static INLINE int +do_resizing_related_stuff(hashtable_t *ht, const void *key, enum op op) +{ + bool resizing = resizing_taking_place(ht); + + if (!resizing) + { + size_t new_size; + + switch (op) + { + case GET: + goto dont_start_resizing; + case ADD: + case SET: + if (needs_growing(ht)) + new_size = ht->tab_size * 2; + else + goto dont_start_resizing; + break; + default: /* case REM */ + if (needs_shrinking(ht)) + new_size = ht->tab_size / 2; + else + goto dont_start_resizing; + } + + struct ht_node **newtab; + if (!(newtab = malloc(new_size * sizeof(struct ht_node*)))) + return op == REM ? REHASHING_NOT_IN_PROGRESS : REHASHING_NO_MEM; + + ht->newtab = newtab; + ht->new_size = new_size; + ht->rehashing_position = 0; + + resizing = true; + } + + dont_start_resizing: + + return resizing ? + rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS; +} + +/* + * This is a chaining hashtable, so each element in the array (table) + * is actually a list of entries. All operations (adding, removing, + * etc.) need to find the right list of entries (here called "bucket") + * for a given key first, so it makes sense to do it in a separate + * function. The bucket may be in tab or newtab if resizing is taking + * place. Being informed by the caller if resizing is in progress, + * this func does not need to check for it by itself. + */ +static INLINE struct ht_node **find_bucket(hashtable_t *ht, const void *key, + bool resizing_in_progress) +{ + size_t hash = hash2(ht->hashfunc(key)), + destination_tab_size, position; + + struct ht_node **destination_tab; + + if (resizing_in_progress) + /* + * Here we must check whether our key's bucket is still + * in ht->tab or already rehashed to ht->newtab. + */ + { + size_t smaller_tab_size = min(ht->tab_size, ht->new_size), + smaller_tab_position = hash & (smaller_tab_size - 1); + + if (smaller_tab_position < ht->rehashing_position) + { + destination_tab = ht->newtab; + destination_tab_size = ht->new_size; + } + else + { + destination_tab = ht->tab; + destination_tab_size = ht->tab_size; + } + } + else + /* In this case we know, we're working on ht->tab and not newtab. */ + { + destination_tab = ht->tab; + destination_tab_size = ht->tab_size; + } + + position = hash & (destination_tab_size - 1); + return &destination_tab[position]; +} + +/* + * Operations of adding, removing, etc. all work on list of entries + * (bucket) to wchich key hashes and they have some common logic, so + * it made sense to make a single function, that does the right + * operation based on an enum passed to it. + */ +static INLINE int +perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket, + const void *key, const void *val, + void **keyptr, void **valptr, + enum op op) +{ + struct ht_node **pairptr, *pair; + + for (pairptr = bucket, pair = *pairptr; + pair; + pairptr = &pair->next, pair = pair->next) + + if (!ht->cmpfunc(key, pair->key)) + { + if (op == ADD) + return HT_KEY_PRESENT; + + if (keyptr) *keyptr = (void*) pair->key; + if (valptr) *valptr = (void*) pair->val; + + switch (op) + { + case GET: + case GET_THREADSAFE: + { + return HT_OK; + } + case SET: + { + pair->key = key; + pair->val = val; + return HT_OK; + } + default: /* case REM */ + { + *pairptr = pair->next; + free(pair); + ht->entries--; + return HT_OK; + } + } + } + + if (op == GET || op == GET_THREADSAFE || op == REM) + return HT_KEY_ABSENT; + + /* op == ADD || op == SET */ + + struct ht_node *new_pair = malloc(sizeof(struct ht_node)); + if (!new_pair) + return HT_NO_MEM; + + *new_pair = (struct ht_node) {.key = key, .val = val}; + push(new_pair, bucket); + ht->entries++; + + return HT_OK; +} + +/* Generic function for performing of adding, removing, setting and getting. */ +static int perform_operation(hashtable_t *ht, const void *key, const void *val, + void **keyptr, void **valptr, enum op op) +{ + bool resizing_in_progress; + + if (op == GET_THREADSAFE) { + resizing_in_progress = resizing_taking_place(ht); + goto skip_resizing; + } + + switch (do_resizing_related_stuff(ht, key, op)) + { + case REHASHING_IN_PROGRESS: + resizing_in_progress = true; + break; + case REHASHING_NOT_IN_PROGRESS: + resizing_in_progress = false; + break; + default: /* case REHASHING_NO_MEM */ + return HT_NO_MEM; + } + + struct ht_node **bucket; + + skip_resizing: + bucket = find_bucket(ht, key, resizing_in_progress); + + return perform_operation_on_bucket(ht, bucket, key, val, + keyptr, valptr, op); +} + +/* The 5 functions below are the main part of the API. */ +int ht_get(hashtable_t *ht, const void *key, + void **storedkey, void **val) +{ + return perform_operation(ht, key, NULL, storedkey, val, GET); +} + +int ht_get_threadsafe(hashtable_t *ht, const void *key, + void **storedkey, void **val) +{ + return perform_operation(ht, key, NULL, storedkey, val, + GET_THREADSAFE); +} + +int ht_add(hashtable_t *ht, const void *key, const void *val) +{ + return perform_operation(ht, key, val, NULL, NULL, ADD); +} + +int ht_set(hashtable_t *ht, const void *key, const void *val, + void **oldkey, void **oldval) +{ + return perform_operation(ht, key, val, oldkey, oldval, SET); +} + +int ht_rem(hashtable_t *ht, const void *key, + void **storedkey, void **val) +{ + return perform_operation(ht, key, NULL, storedkey, val, REM); +} + +/* + * As mentioned in hashtable.h, this func does not deallocate keys + * nor vals. One could use ht_map_destroy() if that is needed. + */ +void ht_destroy(hashtable_t *ht) +{ + ssize_t position; + + if (!ht->entries) + goto free_tab; + + ht_finish_resizing(ht); + + struct ht_node **tab = ht->tab; + + for (position = ht->tab_size - 1; position >= 0; position--) + { + struct ht_node *pair = tab[position], *nextpair; + + while (pair) + { + nextpair = pair->next; + free(pair); + pair = nextpair; + } + } + + free_tab: + free(ht->tab); +} + +void ht_map(hashtable_t *ht, void *arg, + void (*mapfunc)(const void *key, void *val, void *arg)) +{ + ssize_t position; + + if (!ht->entries) + return; + + ht_finish_resizing(ht); + + struct ht_node **tab = ht->tab, *pair; + + for (position = ht->tab_size - 1; position >= 0; position--) + { + for (pair = tab[position]; pair; pair = pair->next) + mapfunc(pair->key, (void*) pair->val, arg); + } +} + +void ht_map_destroy(hashtable_t *ht, void *arg, + void (*mapfunc)(void *key, void *val, void *arg)) +{ + /* + * If mapfunc() deallocates keys, the following 2 lines make + * assumption on ht_destroy(), that it doesn't call ht->hashfunc() + * or ht->cmpfunc() on keys. + */ + ht_map(ht, arg, (void (*)(const void*, void*, void*)) mapfunc); + ht_destroy(ht); +} + +/* + * These 2 functions are for easy making of hashtable with strings as + * keys. Note that this hash is *not* secure against DoS attacks. + */ +size_t ht_string_hash(const char *key) +{ + size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4; + + do + { + char shift = ((unsigned char) key[i]) % sizeof(size_t); + hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift))) + ^ key[i]; + } + while (key[i++]); + + return hash; +} + +int ht_string_init(hashtable_t *ht) +{ + return ht_init(ht, (size_t (*)(const void*)) &ht_string_hash, + (int (*)(const void*, const void*)) &strcmp); +} |