/**
 * C hashtable implementation
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * GENERAL INFO
 *
 * You might want to read the beginning of hashtable.h first.
 *
 * In some places "rehashing" and in other ones "resizing" seemed to
 * be the right word to use. They mean more or less the same.
 *
 * Functions starting with ht_ are part of the API. Internal functions
 * are declared static. I also made some of them inline (either
 * because they were extremely short or only called from 1 place).
 *
 * Hashtable size is always a power of 2.
 *
 * When the hashtable is 3/4 full, a new, 2x bigger table is allocated
 * and whenever one of the 4 basic operations (adding, removing,
 * setting, getting) occurs, 4 slots are rehashed from the old table
 * into 8 slots in the new table. Similarly, when the hashtable is 1/4
 * full, a new, 2x smaller table is allocated and each subsequent
 * operation rehashes 8 entries from the old table into 4 in the new
 * table.
 * This mechanism has been made lazier: getting and removing don't
 * trigger growing of the ht even if it's 3/4 full. Similarly, getting,
 * setting and adding don't trigger shrinking.
 * Once resizing is triggered, however, any of the operations will
 * contribute to rehashing, even if, for example, the operation is
 * ADD and the table is being shrunk.
 * This means that if we have a hashtable of size n which is 3/4 full
 * and growing is triggered, then each subsequent call to
 * ht_{add,rem,get,set}() rehashes some entries and, depending on
 * how frequently and how successfully each of these 4 funcs was
 * called, at the end of resizing we get a size 2n hashtable which is
 * between 1/4 and 1/2 full. Similarly, if shrinking of a 1/4 full
 * hashtable of size n is triggered, then after some operations we
 * get a size n/2 hashtable that is somewhere between 1/4 and 3/4
 * full.
 * One can see now that we always keep the hashtable between 1/4 and
 * 3/4 full (with the exception of a minimal size one, which can be
 * empty).
 */

#include "hashtable.h"

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>

#ifdef _USE_INLINE
#define INLINE inline
#else
#define INLINE
#endif

/*
 * We won't shrink the hashtable below this size. A newly created one
 * will be this big.
 */
#define MIN_SIZE 4

/* Special value of ht->rehashing_position. */
#define NOT_REHASHING ((ssize_t) -1)

/*
 * These are the possible return values of do_resizing_related_stuff()
 * and rehash_some_entries() (which only returns the first 2).
 */
#define REHASHING_IN_PROGRESS     0
#define REHASHING_NOT_IN_PROGRESS 1
#define REHASHING_NO_MEM          2

/* Struct used to store a pair. */
struct ht_node {
    const void *key;
    const void *val;

    struct ht_node *next;
};

enum op {
    GET,
    GET_THREADSAFE,
    ADD,
    SET,
    REM,
};

int ht_init(hashtable_t *ht, size_t (*hash)(const void *key),
            int (*cmp)(const void *key1, const void *key2))
{
    if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node*))))
        return HT_NO_MEM;

    ht->tab_size = MIN_SIZE;
    ht->rehashing_position = NOT_REHASHING;
    ht->entries = 0;
    ht->hashfunc = hash;
    ht->cmpfunc = cmp;

    return HT_OK;
}

/* First come some utilities :) */

static INLINE size_t min(size_t n1, size_t n2)
{
    return n1 < n2 ? n1 : n2;
}

static INLINE size_t hash2(size_t n)
{
    /* I found this "hash improver" on the internet. */
    n ^= (n >> 20) ^ (n >> 12);
    return n ^ (n >> 7) ^ (n >> 4);
}
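
/*
 * Illustration only, kept compiled out so it doesn't affect the
 * library: because tab_size is always a power of 2, a slot index can
 * be computed by masking the improved hash with (tab_size - 1)
 * instead of using the modulo operator. The helper name below is
 * hypothetical; find_bucket() further down applies the same trick.
 */
#if 0
static size_t example_slot_index(hashtable_t *ht, const void *key)
{
    /*
     * E.g. with tab_size == 8 the mask is binary 111, so only the
     * lowest 3 bits of the improved hash select the slot.
     */
    return hash2(ht->hashfunc(key)) & (ht->tab_size - 1);
}
#endif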
/* Below are 2 list-handling utility functions. */

static INLINE struct ht_node *join_lists(struct ht_node *l1,
                                         struct ht_node *l2)
{
    if (!l1)
        return l2;

    if (!l2)
        return l1;

    struct ht_node *l1_last;
    for (l1_last = l1; l1_last->next; l1_last = l1_last->next);

    /* Append l2 to the end of l1. */
    l1_last->next = l2;

    /* For convenience return the first element of the resulting list. */
    return l1;
}

static INLINE void push(struct ht_node *node, struct ht_node **list)
{
    node->next = *list;
    *list = node;
}

/*
 * The following 2 rehash_* functions are helpers of rehash_some_entries().
 * This func rehashes 1 chain of entries in tab[] into 2 chains in newtab[].
 */
static INLINE void rehash_position_growing(hashtable_t *ht)
{
    /* There are 2 possible new positions of an entry in a 2x bigger ht. */
    struct ht_node *list0 = NULL, *list1 = NULL;

    size_t old_position = ht->rehashing_position,
        new_position0 = old_position,
        new_position1 = old_position | ht->tab_size;

    struct ht_node *pair = ht->tab[old_position], *next_pair;

    while (pair)
    {
        next_pair = pair->next;

        size_t new_position =
            hash2(ht->hashfunc(pair->key)) & (ht->new_size - 1);
        push(pair, new_position == new_position1 ? &list1 : &list0);

        pair = next_pair;
    }

    ht->newtab[new_position0] = list0;
    ht->newtab[new_position1] = list1;

    ht->rehashing_position++;
}

/* This func rehashes 2 chains of entries in tab[] into 1 chain in newtab[]. */
static INLINE void rehash_2positions_shrinking(hashtable_t *ht)
{
    size_t new_position = ht->rehashing_position,
        old_position0 = new_position,
        old_position1 = new_position | ht->new_size;

    ht->newtab[new_position] = join_lists(ht->tab[old_position0],
                                          ht->tab[old_position1]);

    ht->rehashing_position++;
}

/*
 * Rehashes 4 positions from tab to newtab when growing (8 when
 * shrinking). If those were the last entries to rehash, the function
 * takes care of everything (like deallocating the old tab) and
 * returns REHASHING_NOT_IN_PROGRESS. Otherwise, returns
 * REHASHING_IN_PROGRESS.
 * Caller must make sure rehashing was started before calling this func.
 */
static int rehash_some_entries(hashtable_t *ht)
{
    int rehashes_left = 4;

    if (ht->new_size > ht->tab_size) /* growing ht */
    {
        while (rehashes_left--)
            rehash_position_growing(ht);

        if (ht->rehashing_position != ht->tab_size)
            return REHASHING_IN_PROGRESS;
    }
    else /* shrinking ht */
    {
        while (rehashes_left--)
            rehash_2positions_shrinking(ht);

        if (ht->rehashing_position != ht->new_size)
            return REHASHING_IN_PROGRESS;
    }

    /* rehashing finishes */
    ht->rehashing_position = NOT_REHASHING;
    ht->tab_size = ht->new_size;

    free(ht->tab);
    ht->tab = ht->newtab;

    return REHASHING_NOT_IN_PROGRESS;
}

static INLINE bool resizing_taking_place(hashtable_t *ht)
{
    return ht->rehashing_position != NOT_REHASHING;
}

void ht_finish_resizing(hashtable_t *ht)
{
    if (resizing_taking_place(ht))
        while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS);
}

static INLINE bool needs_growing(hashtable_t *ht)
{
    return ht->entries >= 3 * ht->tab_size / 4;
}

static INLINE bool needs_shrinking(hashtable_t *ht)
{
    return ht->tab_size > MIN_SIZE && ht->entries <= ht->tab_size / 4;
}
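
/*
 * Illustration only, kept compiled out: when the table grows from
 * old_size to 2 * old_size, an entry from old slot i can only end up
 * in new slot i or in new slot (i | old_size), because the wider mask
 * merely exposes one extra hash bit. This is the invariant that
 * rehash_position_growing() above relies on; the hypothetical helper
 * below just restates that property.
 */
#if 0
static bool example_growing_invariant_holds(size_t hash, size_t old_size)
{
    /* old_size is assumed to be a power of 2, as everywhere in this file. */
    size_t old_slot = hash & (old_size - 1);
    size_t new_slot = hash & (2 * old_size - 1);

    return new_slot == old_slot || new_slot == (old_slot | old_size);
}
#endif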
/*
 * Each of the hashtable operations (add, set, rem, get) should also
 * attempt to do part of resizing. This way the resizing operation,
 * which is O(n), is distributed among many hashtable accesses, each
 * of them still being O(1). Without this the amortized complexity of
 * ht accesses would still be O(1), but a single access would
 * sometimes be O(n).
 * Any function that adds, sets, gets or removes something from the ht
 * uses this one to do the "part of resizing" mentioned above.
 * This func returns REHASHING_NO_MEM on failed malloc (won't happen
 * for a GET or REM operation) and REHASHING_[NOT_]IN_PROGRESS otherwise.
 */
static INLINE int do_resizing_related_stuff(hashtable_t *ht,
                                            const void *key, enum op op)
{
    bool resizing = resizing_taking_place(ht);

    if (!resizing)
    {
        size_t new_size;

        switch (op)
        {
        case GET:
            goto dont_start_resizing;
        case ADD:
        case SET:
            if (needs_growing(ht))
                new_size = ht->tab_size * 2;
            else
                goto dont_start_resizing;
            break;
        default: /* case REM */
            if (needs_shrinking(ht))
                new_size = ht->tab_size / 2;
            else
                goto dont_start_resizing;
        }

        struct ht_node **newtab;
        if (!(newtab = malloc(new_size * sizeof(struct ht_node*))))
            return op == REM ?
                REHASHING_NOT_IN_PROGRESS : REHASHING_NO_MEM;

        ht->newtab = newtab;
        ht->new_size = new_size;
        ht->rehashing_position = 0;

        resizing = true;
    }

dont_start_resizing:
    return resizing ? rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS;
}

/*
 * This is a chaining hashtable, so each element in the array (table)
 * is actually a list of entries. All operations (adding, removing,
 * etc.) need to find the right list of entries (here called "bucket")
 * for a given key first, so it makes sense to do it in a separate
 * function. The bucket may be in tab or newtab if resizing is taking
 * place. Since the caller tells this func whether resizing is in
 * progress, it does not need to check for that by itself.
 */
static INLINE struct ht_node **find_bucket(hashtable_t *ht, const void *key,
                                           bool resizing_in_progress)
{
    size_t hash = hash2(ht->hashfunc(key)), destination_tab_size, position;
    struct ht_node **destination_tab;

    if (resizing_in_progress)
        /*
         * Here we must check whether our key's bucket is still
         * in ht->tab or already rehashed to ht->newtab.
         */
    {
        size_t smaller_tab_size = min(ht->tab_size, ht->new_size),
            smaller_tab_position = hash & (smaller_tab_size - 1);

        if (smaller_tab_position < ht->rehashing_position)
        {
            destination_tab = ht->newtab;
            destination_tab_size = ht->new_size;
        }
        else
        {
            destination_tab = ht->tab;
            destination_tab_size = ht->tab_size;
        }
    }
    else /* In this case we know we're working on ht->tab and not newtab. */
    {
        destination_tab = ht->tab;
        destination_tab_size = ht->tab_size;
    }

    position = hash & (destination_tab_size - 1);

    return &destination_tab[position];
}
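
/*
 * Illustration only, kept compiled out: during an in-progress resize
 * a key lives in newtab exactly when its slot in the *smaller* of the
 * two tables has already been passed by rehashing_position; otherwise
 * it is still in the old tab. The hypothetical helper below restates
 * the rule that find_bucket() applies above.
 */
#if 0
static bool example_key_already_rehashed(hashtable_t *ht, const void *key)
{
    size_t smaller_tab_size = min(ht->tab_size, ht->new_size);
    size_t slot = hash2(ht->hashfunc(key)) & (smaller_tab_size - 1);

    return (ssize_t) slot < ht->rehashing_position;
}
#endif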
/*
 * The operations of adding, removing, etc. all work on the list of
 * entries (bucket) to which the key hashes and they have some common
 * logic, so it made sense to make a single function that does the
 * right operation based on an enum passed to it.
 */
static INLINE int perform_operation_on_bucket(hashtable_t *ht,
                                              struct ht_node **bucket,
                                              const void *key, const void *val,
                                              void **keyptr, void **valptr,
                                              enum op op)
{
    struct ht_node **pairptr, *pair;

    for (pairptr = bucket, pair = *pairptr; pair;
         pairptr = &pair->next, pair = pair->next)
        if (!ht->cmpfunc(key, pair->key))
        {
            if (op == ADD)
                return HT_KEY_PRESENT;

            if (keyptr)
                *keyptr = (void*) pair->key;

            if (valptr)
                *valptr = (void*) pair->val;

            switch (op)
            {
            case GET:
            case GET_THREADSAFE:
            {
                return HT_OK;
            }
            case SET:
            {
                pair->key = key;
                pair->val = val;

                return HT_OK;
            }
            default: /* case REM */
            {
                *pairptr = pair->next;
                free(pair);

                ht->entries--;

                return HT_OK;
            }
            }
        }

    if (op == GET || op == GET_THREADSAFE || op == REM)
        return HT_KEY_ABSENT;

    /* op == ADD || op == SET */

    struct ht_node *new_pair = malloc(sizeof(struct ht_node));
    if (!new_pair)
        return HT_NO_MEM;

    *new_pair = (struct ht_node) {.key = key, .val = val};
    push(new_pair, bucket);

    ht->entries++;

    return HT_OK;
}

/* Generic function that performs adding, removing, setting and getting. */
static int perform_operation(hashtable_t *ht, const void *key, const void *val,
                             void **keyptr, void **valptr, enum op op)
{
    bool resizing_in_progress;

    if (op == GET_THREADSAFE)
    {
        resizing_in_progress = resizing_taking_place(ht);
        goto skip_resizing;
    }

    switch (do_resizing_related_stuff(ht, key, op))
    {
    case REHASHING_IN_PROGRESS:
        resizing_in_progress = true;
        break;
    case REHASHING_NOT_IN_PROGRESS:
        resizing_in_progress = false;
        break;
    default: /* case REHASHING_NO_MEM */
        return HT_NO_MEM;
    }

    struct ht_node **bucket;

skip_resizing:
    bucket = find_bucket(ht, key, resizing_in_progress);

    return perform_operation_on_bucket(ht, bucket, key, val,
                                       keyptr, valptr, op);
}

/* The 5 functions below are the main part of the API. */

int ht_get(hashtable_t *ht, const void *key, void **storedkey, void **val)
{
    return perform_operation(ht, key, NULL, storedkey, val, GET);
}

int ht_get_threadsafe(hashtable_t *ht, const void *key, void **storedkey,
                      void **val)
{
    return perform_operation(ht, key, NULL, storedkey, val, GET_THREADSAFE);
}

int ht_add(hashtable_t *ht, const void *key, const void *val)
{
    return perform_operation(ht, key, val, NULL, NULL, ADD);
}

int ht_set(hashtable_t *ht, const void *key, const void *val,
           void **oldkey, void **oldval)
{
    return perform_operation(ht, key, val, oldkey, oldval, SET);
}

int ht_rem(hashtable_t *ht, const void *key, void **storedkey, void **val)
{
    return perform_operation(ht, key, NULL, storedkey, val, REM);
}
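
/*
 * Illustrative usage sketch, kept compiled out (not part of the
 * library). It only relies on the public API above plus the string
 * helpers defined at the end of this file; error handling is
 * shortened and the function name is hypothetical.
 */
#if 0
static void example_api_usage(void)
{
    hashtable_t ht;
    void *old_val = NULL;

    if (ht_string_init(&ht) != HT_OK)
        return;

    /* ht_add() refuses to overwrite an existing key... */
    ht_add(&ht, "pi", "3.14159");

    /*
     * ...while ht_set() inserts or replaces and can hand back the
     * previously stored key/value through its last 2 arguments.
     */
    ht_set(&ht, "pi", "3.14", NULL, &old_val);

    if (ht_get(&ht, "pi", NULL, &old_val) == HT_OK)
        printf("pi -> %s\n", (const char*) old_val);

    ht_rem(&ht, "pi", NULL, NULL);

    ht_destroy(&ht);
}
#endif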
/*
 * As mentioned in hashtable.h, this func does not deallocate keys
 * or vals. One could use ht_map_destroy() if that is needed.
 */
void ht_destroy(hashtable_t *ht)
{
    ssize_t position;

    if (!ht->entries)
        goto free_tab;

    ht_finish_resizing(ht);

    struct ht_node **tab = ht->tab;

    for (position = ht->tab_size - 1; position >= 0; position--)
    {
        struct ht_node *pair = tab[position], *nextpair;

        while (pair)
        {
            nextpair = pair->next;
            free(pair);
            pair = nextpair;
        }
    }

free_tab:
    free(ht->tab);
}

void ht_map(hashtable_t *ht, void *arg,
            void (*mapfunc)(const void *key, void *val, void *arg))
{
    ssize_t position;

    if (!ht->entries)
        return;

    ht_finish_resizing(ht);

    struct ht_node **tab = ht->tab, *pair;

    for (position = ht->tab_size - 1; position >= 0; position--)
    {
        for (pair = tab[position]; pair; pair = pair->next)
            mapfunc(pair->key, (void*) pair->val, arg);
    }
}

void ht_map_destroy(hashtable_t *ht, void *arg,
                    void (*mapfunc)(void *key, void *val, void *arg))
{
    /*
     * If mapfunc() deallocates keys, the following 2 lines rely on
     * the assumption that ht_destroy() doesn't call ht->hashfunc()
     * or ht->cmpfunc() on keys.
     */
    ht_map(ht, arg, (void (*)(const void*, void*, void*)) mapfunc);
    ht_destroy(ht);
}

/*
 * These 2 functions make it easy to create a hashtable with strings
 * as keys. Note that this hash is *not* secure against DoS attacks.
 */
size_t ht_string_hash(const char *key)
{
    size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4;

    do
    {
        char shift = ((unsigned char) key[i]) % sizeof(size_t);
        hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift)))
            ^ key[i];
    } while (key[i++]);

    return hash;
}

int ht_string_init(hashtable_t *ht)
{
    return ht_init(ht, (size_t (*)(const void*)) &ht_string_hash,
                   (int (*)(const void*, const void*)) &strcmp);
}
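
/*
 * Illustrative sketch, kept compiled out: how ht_map_destroy() can be
 * combined with a user callback to free heap-allocated keys and
 * values before the table itself is torn down. The callback and
 * wrapper names are hypothetical.
 */
#if 0
static void example_free_pair(void *key, void *val, void *arg)
{
    (void) arg;

    free(key);
    free(val);
}

static void example_teardown(hashtable_t *ht)
{
    /* Frees every stored key and value, then the table itself. */
    ht_map_destroy(ht, NULL, example_free_pair);
}
#endif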