From 4d7361009863a0ea2bd75919a79574cdc213c72b Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Tue, 23 Jul 2019 13:47:34 +0200 Subject: initial commit --- hashtable.c | 502 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 502 insertions(+) create mode 100644 hashtable.c (limited to 'hashtable.c') diff --git a/hashtable.c b/hashtable.c new file mode 100644 index 0000000..c2dc582 --- /dev/null +++ b/hashtable.c @@ -0,0 +1,502 @@ +// GENERAL INFO + +// You might want to read the beginning of hashtable.h first. + +// In some places "rehashing" and in other ones "resizing" seemed to +// be the right word to use. They mean more or less the same. + +// Functions starting with ht_ are part of the API. Internal functions +// are declared static. I also made some of them inline (either +// because they were extremely short or only called from 1 place). + +// Hashtable size is always a power of 2. + +// When the hashtable is ¾ full, a new, 2x bigger table is allocated +// and whenever one of 4 basic operations (adding, removing, setting, +// getting) occurs, 4 slots are being rehashed from old table into 8 +// slots in new table. Similarly, when hashtable is ¼ full, a new, +// 2x smaller table is allocated and each of subsequent operations +// rehashes 8 entries from old table into 4 in new table. +// This mechanism has been made lazier: getting and removing don't +// trigger growing of ht even if it's 3/4 full. Similarly, getting, +// setting and adding don't trigger shrinking. +// Once resizing is triggered, however, any of the operations will +// contribute to rehashing. Even if, for example, the operation is +// ADD and the table is being shrinked. +// This means, that if we have a hashtable of size n which is ¾ full +// and growing is triggered, then each subsequent call to +// ht_{add,rem,get,set}() rehashes some entries and, depending on +// how frequently and how successfully each of these 4 funcs was +// called, at the end of resizing we get a size 2n hashtable which is +// between ¼ and ½ full. Similarly, if shrinking of a ¼ full +// hashtable of size n is triggered, then after some operations we +// get a size ½n hashtable, that is somewhere between ¼ and ¾ full. +// One can see now, that we always keep the hashtable between ¼ and ¾ +// full (with the exception of a minimal size one, that can be empty). + +#include "hashtable.h" + +#include +#include +#include +#include + +// We won't shrink hashtable below this size. Newly created one will +// be this big. +#define MIN_SIZE 32 + +// Special value of ht->rehashing_position. +#define NOT_REHASHING ((ssize_t) -1) + +// Those are possible return values of do_resizing_related_stuff() +// and rehash_some_entries() (which only returns the first 2). +#define REHASHING_IN_PROGRESS 0 +#define REHASHING_NOT_IN_PROGRESS 1 +#define REHASHING_NO_MEM 2 + +enum op + { + GET, + ADD, + SET, + REM, + }; + +int ht_init(hashtable_t *ht, + size_t (*hash)(void *key), + int (*cmp)(void *key1, void *key2)) +{ + if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**)))) + return HT_NO_MEM; + + ht->tab_size = MIN_SIZE; + ht->rehashing_position = NOT_REHASHING; + ht->entries = 0; + ht->hashfunc = hash; + ht->cmpfunc = cmp; + + return HT_OK; +} + +// First come some utilities :) + +static inline size_t min(size_t n1, size_t n2) +{ + return n1 < n2 ? n1 : n2; +} + +static inline size_t hash2(size_t n) +{ + // I found this "hash improver" on the internet. + n ^= (n >> 20) ^ (n >> 12); + return n ^ (n >> 7) ^ (n >> 4); +} + +// Below are 2 list-handling utility functions. +static inline struct ht_node *join_lists(struct ht_node *l1, + struct ht_node *l2) +{ + if (!l1) return l2; + if (!l2) return l1; + + struct ht_node *l1_last; + + for (l1_last = l1; l1_last->next; l1_last = l1_last->next); + + // Append l2 to the end of l1. + l1_last->next = l2; + + // For convenience return the first element of the resulting list. + return l1; +} + + +static inline void push(struct ht_node *node, struct ht_node **list) +{ + node->next = *list; + *list = node; +} + +// The following 2 rehash_* functions are helpers of +// rehash_some_entries(). +// This func rehashes 1 chain of entries in tab[] +// into 2 chains in newtab[]. +static inline void rehash_position_growing(hashtable_t *ht) +{ + // There are 2 possible new positions of an entry in a 2x bigger ht. + struct ht_node *list0 = NULL, *list1 = NULL; + + size_t old_position = ht->rehashing_position, + new_position0 = old_position, + new_position1 = old_position | ht->tab_size; + + struct ht_node *pair = ht->tab[old_position], *next_pair; + + while (pair) + { + next_pair = pair->next; + + size_t new_position = hash2(ht->hashfunc(pair->key)) + & (ht->new_size - 1); + + push(pair, new_position == new_position1 ? &list1 : &list0); + + pair = next_pair; + } + + ht->newtab[new_position0] = list0; + ht->newtab[new_position1] = list1; + + ht->rehashing_position++; +} + +// This func rehashes 2 chains of entries in tab[] +// into 1 chain in newtab[]. +static inline void rehash_2positions_shrinking(hashtable_t *ht) +{ + size_t new_position = ht->rehashing_position, + old_position0 = new_position, + old_position1 = new_position | ht->new_size; + + ht->newtab[new_position] = join_lists(ht->tab[old_position0], + ht->tab[old_position1]); + + ht->rehashing_position++; +} + +// Rehashes 4(8) positions from tab to newtab. If those were the last +// enetries to rehash, the function takes care of everything +// (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS. +// Otherwise, returns REHASHING_IN_PROGRESS. +// Caller must make sure rehashing was started b4 calling this func. +static int rehash_some_entries(hashtable_t *ht) +{ + int rehashes_left = 4; + + if (ht->new_size > ht->tab_size) // growing ht + { + while(rehashes_left--) rehash_position_growing(ht); + if (ht->rehashing_position != ht->tab_size) + return REHASHING_IN_PROGRESS; + } + else // shrinking ht + { + while(rehashes_left--) rehash_2positions_shrinking(ht); + if (ht->rehashing_position != ht->new_size) + return REHASHING_IN_PROGRESS; + } + + // rehashing finishes + ht->rehashing_position = NOT_REHASHING; + ht->tab_size = ht->new_size; + free(ht->tab); + ht->tab = ht->newtab; + + return REHASHING_NOT_IN_PROGRESS; +} + +static inline bool resizing_taking_place(hashtable_t *ht) +{ + return !(ht->rehashing_position == NOT_REHASHING); +} + +void ht_finish_resizing(hashtable_t *ht) +{ + if (resizing_taking_place(ht)) + while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS); +} + +static inline bool needs_growing(hashtable_t *ht) +{ + return ht->entries == 3 * ht->tab_size / 4; +} + +static inline bool needs_shrinking(hashtable_t *ht) +{ + return ht->tab_size > MIN_SIZE + && ht->entries == ht->tab_size / 4; +} + +// Each of hashtable operations (add, set, rem, get) should also +// attempt to do part of resizing. This way resizing operation +// which is O(n) is distributed among many hashtable accesses +// each of them still being O(1). Without this the the amortized +// complexity of ht accesses would still be O(1), but a single access +// would sometimes be O(n). +// Other function that adds, sets, gets or removes sth from ht uses +// this one to do this "part of resizing" mentioned above. +// This func returns REHASHING_NO_MEM on failed malloc (won't happen +// for GET operation) and REHASHING_(NOT_)IN_PROGRESS otherwise. +static inline int do_resizing_related_stuff(hashtable_t *ht, + void *key, enum op op) +{ + bool resizing = resizing_taking_place(ht); + + if (!resizing) + { + size_t new_size; + + switch (op) + { + case GET: + goto dont_start_resizing; + case ADD: + case SET: + if (needs_growing(ht)) + new_size = ht->tab_size * 2; + else + goto dont_start_resizing; + break; + default: // case REM + if (needs_shrinking(ht)) + new_size = ht->tab_size / 2; + else + goto dont_start_resizing; + } + + struct ht_node **newtab; + if (!(newtab = malloc(new_size * sizeof(struct ht_node*)))) + return REHASHING_NO_MEM; + + ht->newtab = newtab; + ht->new_size = new_size; + ht->rehashing_position = 0; + + resizing = true; + } + + dont_start_resizing: + + return resizing ? + rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS; +} + +// This is a chaining hashtable, so each element in the array (table) +// is actually a list of entries. All operations (adding, removing, +// etc.) need to find the right list of entries (here called "bucket") +// for a given key first, so it makes sense to do it in a separate +// function. The bucket may be in tab or newtab if resizing is taking +// place. Being informed by the caller if resizing is in progress, +// this func does not need to check for it itself. +static inline struct ht_node **find_bucket(hashtable_t *ht, void *key, + bool resizing_in_progress) +{ + size_t hash = hash2(ht->hashfunc(key)), + destination_tab_size, position; + + struct ht_node **destination_tab; + + if (resizing_in_progress) + // Here we must check whether our key's bucket is still + // in ht->tab or already rehashed to ht->newtab. + { + size_t smaller_tab_size = min(ht->tab_size, ht->new_size), + smaller_tab_position = hash & (smaller_tab_size - 1); + + if (smaller_tab_position < ht->rehashing_position) + { + destination_tab = ht->newtab; + destination_tab_size = ht->new_size; + } + else + { + destination_tab = ht->tab; + destination_tab_size = ht->tab_size; + } + } + else + // In this case we know, we're working on ht->tab and not newtab. + { + destination_tab = ht->tab; + destination_tab_size = ht->tab_size; + } + + position = hash & (destination_tab_size - 1); + return &destination_tab[position]; +} + +// Operations of adding, removing, etc. all work on list of entries +// (bucket) to wchich key hashes and they have some common logic, so +// it made sense to make a single function, that does the right +// operation based on an enum passed to it. +static inline int +perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket, + void *key, void *val, + void **keyptr, void **valptr, + enum op op) +{ + for (struct ht_node **pairptr = bucket, *pair = *pairptr; + pair; + pairptr = &pair->next, pair = pair->next) + + if (!ht->cmpfunc(key, pair->key)) + { + if (op == ADD) + return HT_KEY_PRESENT; + + if (keyptr) *keyptr = pair->key; + if (valptr) *valptr = pair->val; + + switch (op) + { + case GET: + { + return HT_OK; + } + case SET: + { + pair->key = key; + pair->val = val; + return HT_OK; + } + default: // case REM + { + *pairptr = pair->next; + free(pair); + ht->entries--; + return HT_OK; + } + } + } + + if (op == GET || op == REM) + return HT_KEY_ABSENT; + + // op == ADD || op == SET + + struct ht_node *new_pair = malloc(sizeof(struct ht_node)); + if (!new_pair) + return HT_NO_MEM; + + *new_pair = (struct ht_node) {.key = key, .val = val}; + push(new_pair, bucket); + ht->entries++; + + return HT_OK; +} + +// Generic function for performing of adding, removing, setting and +// getting. +static int perform_operation(hashtable_t *ht, void *key, void *val, + void **keyptr, void **valptr, enum op op) +{ + bool resizing_in_progress; + + switch (do_resizing_related_stuff(ht, key, op)) + { + case REHASHING_IN_PROGRESS: + resizing_in_progress = true; + break; + case REHASHING_NOT_IN_PROGRESS: + resizing_in_progress = false; + break; + default: // case REHASHING_NO_MEM + return HT_NO_MEM; + } + + struct ht_node **bucket = + find_bucket(ht, key, resizing_in_progress); + + return perform_operation_on_bucket(ht, bucket, key, val, + keyptr, valptr, op); +} + +// The 4 functions below are the main part of the API. +int ht_get(hashtable_t *ht, void *key, + void **storedkey, void **val) +{ + return perform_operation(ht, key, NULL, storedkey, val, GET); +} + +int ht_add(hashtable_t *ht, void *key, void *val) +{ + return perform_operation(ht, key, val, NULL, NULL, ADD); +} + +int ht_set(hashtable_t *ht, void *key, void *val, + void **oldkey, void **oldval) +{ + return perform_operation(ht, key, val, oldkey, oldval, SET); +} + +int ht_rem(hashtable_t *ht, void *key, + void **storedkey, void **val) +{ + return perform_operation(ht, key, NULL, storedkey, val, REM); +} + +// As mentioned in hashtable.h, this func does not deallocate keys +// nor vals. One could use ht_map() to free() them before calling this +// func. +void ht_destroy(hashtable_t *ht) +{ + if (!ht->entries) return; + ht_finish_resizing(ht); + + struct ht_node **tab = ht->tab; + + for (ssize_t position = ht->tab_size - 1; + position >= 0; position--) + { + struct ht_node *pair = tab[position], *nextpair; + + while (pair) + { + nextpair = pair->next; + free(pair); + pair = nextpair; + } + } + + free(ht->tab); +} + +void ht_map(hashtable_t *ht, void *arg, + void (*mapfunc)(void *key, void *val, void *arg)) +{ + if (!ht->entries) return; + ht_finish_resizing(ht); + + struct ht_node **tab = ht->tab; + + for (ssize_t position = ht->tab_size - 1; + position >= 0; position--) + + for (struct ht_node *pair = tab[position]; + pair; pair = pair->next) + + mapfunc(pair->key, pair->val, arg); +} + +void ht_map_destroy(hashtable_t *ht, void *arg, + void (*mapfunc)(void *key, void *val, void *arg)) +{ + // If mapfunc() deallocates keys, then the next 2 lines require + // assumption on ht_destroy(), that it doesn't call ht->hashfunc() + // or ht->cmpfunc() on keys. + ht_map(ht, arg, mapfunc); + ht_destroy(ht); +} + +// These 2 functions are for easy making of hashtable with strings as +// keys. +size_t ht_string_hash(const char *key) +{ + size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4; + + do + { + char shift = ((unsigned char) key[i]) % sizeof(size_t); + hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift))) + ^ key[i]; + } + while (key[i++]); + + return hash; +} + +int ht_string_init(hashtable_t *ht) +{ + return ht_init(ht, (size_t (*)(void*)) &ht_string_hash, + (int (*)(void*, void*)) &strcmp); +} -- cgit v1.2.3