// https://libregit.org/koszko/C_hashtable // Note that this version might or might not be more up-to-date than // the one linked. // GENERAL INFO // You might want to read the beginning of hashtable.h first. // In some places "rehashing" and in other ones "resizing" seemed to // be the right word to use. They mean more or less the same. // Functions starting with ht_ are part of the API. Internal functions // are declared static. I also made some of them inline (either // because they were extremely short or only called from 1 place). // Hashtable size is always a power of 2. // When the hashtable is ¾ full, a new, 2x bigger table is allocated // and whenever one of 4 basic operations (adding, removing, setting, // getting) occurs, 4 slots are being rehashed from old table into 8 // slots in new table. Similarly, when hashtable is ¼ full, a new, // 2x smaller table is allocated and each of subsequent operations // rehashes 8 entries from old table into 4 in new table. // This mechanism has been made lazier: getting and removing don't // trigger growing of ht even if it's 3/4 full. Similarly, getting, // setting and adding don't trigger shrinking. // Once resizing is triggered, however, any of the operations will // contribute to rehashing. Even if, for example, the operation is // ADD and the table is being shrinked. // This means, that if we have a hashtable of size n which is ¾ full // and growing is triggered, then each subsequent call to // ht_{add,rem,get,set}() rehashes some entries and, depending on // how frequently and how successfully each of these 4 funcs was // called, at the end of resizing we get a size 2n hashtable which is // between ¼ and ½ full. Similarly, if shrinking of a ¼ full // hashtable of size n is triggered, then after some operations we // get a size ½n hashtable, that is somewhere between ¼ and ¾ full. // One can see now, that we always keep the hashtable between ¼ and ¾ // full (with the exception of a minimal size one, that can be empty). #include "hashtable.h" #include #include #include #include // We won't shrink hashtable below this size. Newly created one will // be this big. #define MIN_SIZE 32 // Special value of ht->rehashing_position. #define NOT_REHASHING ((ssize_t) -1) // Those are possible return values of do_resizing_related_stuff() // and rehash_some_entries() (which only returns the first 2). #define REHASHING_IN_PROGRESS 0 #define REHASHING_NOT_IN_PROGRESS 1 #define REHASHING_NO_MEM 2 /* Struct used to store a pair. */ struct ht_node { const void *key; const void *val; struct ht_node *next; }; enum op { GET, ADD, SET, REM, }; int ht_init(hashtable_t *ht, size_t (*hash)(const void *key), int (*cmp)(const void *key1, const void *key2)) { if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**)))) return HT_NO_MEM; ht->tab_size = MIN_SIZE; ht->rehashing_position = NOT_REHASHING; ht->entries = 0; ht->hashfunc = hash; ht->cmpfunc = cmp; return HT_OK; } // First come some utilities :) static inline size_t min(size_t n1, size_t n2) { return n1 < n2 ? n1 : n2; } static inline size_t hash2(size_t n) { // I found this "hash improver" on the internet. n ^= (n >> 20) ^ (n >> 12); return n ^ (n >> 7) ^ (n >> 4); } // Below are 2 list-handling utility functions. static inline struct ht_node *join_lists(struct ht_node *l1, struct ht_node *l2) { if (!l1) return l2; if (!l2) return l1; struct ht_node *l1_last; for (l1_last = l1; l1_last->next; l1_last = l1_last->next); // Append l2 to the end of l1. l1_last->next = l2; // For convenience return the first element of the resulting list. return l1; } static inline void push(struct ht_node *node, struct ht_node **list) { node->next = *list; *list = node; } // The following 2 rehash_* functions are helpers of // rehash_some_entries(). // This func rehashes 1 chain of entries in tab[] // into 2 chains in newtab[]. static inline void rehash_position_growing(hashtable_t *ht) { // There are 2 possible new positions of an entry in a 2x bigger ht. struct ht_node *list0 = NULL, *list1 = NULL; size_t old_position = ht->rehashing_position, new_position0 = old_position, new_position1 = old_position | ht->tab_size; struct ht_node *pair = ht->tab[old_position], *next_pair; while (pair) { next_pair = pair->next; size_t new_position = hash2(ht->hashfunc(pair->key)) & (ht->new_size - 1); push(pair, new_position == new_position1 ? &list1 : &list0); pair = next_pair; } ht->newtab[new_position0] = list0; ht->newtab[new_position1] = list1; ht->rehashing_position++; } // This func rehashes 2 chains of entries in tab[] // into 1 chain in newtab[]. static inline void rehash_2positions_shrinking(hashtable_t *ht) { size_t new_position = ht->rehashing_position, old_position0 = new_position, old_position1 = new_position | ht->new_size; ht->newtab[new_position] = join_lists(ht->tab[old_position0], ht->tab[old_position1]); ht->rehashing_position++; } // Rehashes 4(8) positions from tab to newtab. If those were the last // enetries to rehash, the function takes care of everything // (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS. // Otherwise, returns REHASHING_IN_PROGRESS. // Caller must make sure rehashing was started b4 calling this func. static int rehash_some_entries(hashtable_t *ht) { int rehashes_left = 4; if (ht->new_size > ht->tab_size) // growing ht { while(rehashes_left--) rehash_position_growing(ht); if (ht->rehashing_position != ht->tab_size) return REHASHING_IN_PROGRESS; } else // shrinking ht { while(rehashes_left--) rehash_2positions_shrinking(ht); if (ht->rehashing_position != ht->new_size) return REHASHING_IN_PROGRESS; } // rehashing finishes ht->rehashing_position = NOT_REHASHING; ht->tab_size = ht->new_size; free(ht->tab); ht->tab = ht->newtab; return REHASHING_NOT_IN_PROGRESS; } static inline bool resizing_taking_place(hashtable_t *ht) { return !(ht->rehashing_position == NOT_REHASHING); } void ht_finish_resizing(hashtable_t *ht) { if (resizing_taking_place(ht)) while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS); } static inline bool needs_growing(hashtable_t *ht) { return ht->entries >= 3 * ht->tab_size / 4; } static inline bool needs_shrinking(hashtable_t *ht) { return ht->tab_size > MIN_SIZE && ht->entries <= ht->tab_size / 4; } // Each of hashtable operations (add, set, rem, get) should also // attempt to do part of resizing. This way resizing operation // which is O(n) is distributed among many hashtable accesses // each of them still being O(1). Without this the the amortized // complexity of ht accesses would still be O(1), but a single access // would sometimes be O(n). // Other function that adds, sets, gets or removes sth from ht uses // this one to do this "part of resizing" mentioned above. // This func returns REHASHING_NO_MEM on failed malloc (won't happen // for GET or REM operation) and REHASHING_[NOT_]IN_PROGRESS otherwise. static inline int do_resizing_related_stuff(hashtable_t *ht, const void *key, enum op op) { bool resizing = resizing_taking_place(ht); if (!resizing) { size_t new_size; switch (op) { case GET: goto dont_start_resizing; case ADD: case SET: if (needs_growing(ht)) new_size = ht->tab_size * 2; else goto dont_start_resizing; break; default: // case REM if (needs_shrinking(ht)) new_size = ht->tab_size / 2; else goto dont_start_resizing; } struct ht_node **newtab; if (!(newtab = malloc(new_size * sizeof(struct ht_node*)))) return op == REM ? REHASHING_NOT_IN_PROGRESS : REHASHING_NO_MEM; ht->newtab = newtab; ht->new_size = new_size; ht->rehashing_position = 0; resizing = true; } dont_start_resizing: return resizing ? rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS; } // This is a chaining hashtable, so each element in the array (table) // is actually a list of entries. All operations (adding, removing, // etc.) need to find the right list of entries (here called "bucket") // for a given key first, so it makes sense to do it in a separate // function. The bucket may be in tab or newtab if resizing is taking // place. Being informed by the caller if resizing is in progress, // this func does not need to check for it by itself. static inline struct ht_node **find_bucket(hashtable_t *ht, const void *key, bool resizing_in_progress) { size_t hash = hash2(ht->hashfunc(key)), destination_tab_size, position; struct ht_node **destination_tab; if (resizing_in_progress) // Here we must check whether our key's bucket is still // in ht->tab or already rehashed to ht->newtab. { size_t smaller_tab_size = min(ht->tab_size, ht->new_size), smaller_tab_position = hash & (smaller_tab_size - 1); if (smaller_tab_position < ht->rehashing_position) { destination_tab = ht->newtab; destination_tab_size = ht->new_size; } else { destination_tab = ht->tab; destination_tab_size = ht->tab_size; } } else // In this case we know, we're working on ht->tab and not newtab. { destination_tab = ht->tab; destination_tab_size = ht->tab_size; } position = hash & (destination_tab_size - 1); return &destination_tab[position]; } // Operations of adding, removing, etc. all work on list of entries // (bucket) to wchich key hashes and they have some common logic, so // it made sense to make a single function, that does the right // operation based on an enum passed to it. static inline int perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket, const void *key, const void *val, void **keyptr, void **valptr, enum op op) { for (struct ht_node **pairptr = bucket, *pair = *pairptr; pair; pairptr = &pair->next, pair = pair->next) if (!ht->cmpfunc(key, pair->key)) { if (op == ADD) return HT_KEY_PRESENT; if (keyptr) *keyptr = (void*) pair->key; if (valptr) *valptr = (void*) pair->val; switch (op) { case GET: { return HT_OK; } case SET: { pair->key = key; pair->val = val; return HT_OK; } default: // case REM { *pairptr = pair->next; free(pair); ht->entries--; return HT_OK; } } } if (op == GET || op == REM) return HT_KEY_ABSENT; // op == ADD || op == SET struct ht_node *new_pair = malloc(sizeof(struct ht_node)); if (!new_pair) return HT_NO_MEM; *new_pair = (struct ht_node) {.key = key, .val = val}; push(new_pair, bucket); ht->entries++; return HT_OK; } // Generic function for performing of adding, removing, setting and // getting. static int perform_operation(hashtable_t *ht, const void *key, const void *val, void **keyptr, void **valptr, enum op op) { bool resizing_in_progress; switch (do_resizing_related_stuff(ht, key, op)) { case REHASHING_IN_PROGRESS: resizing_in_progress = true; break; case REHASHING_NOT_IN_PROGRESS: resizing_in_progress = false; break; default: // case REHASHING_NO_MEM return HT_NO_MEM; } struct ht_node **bucket = find_bucket(ht, key, resizing_in_progress); return perform_operation_on_bucket(ht, bucket, key, val, keyptr, valptr, op); } // The 4 functions below are the main part of the API. int ht_get(hashtable_t *ht, const void *key, void **storedkey, void **val) { return perform_operation(ht, key, NULL, storedkey, val, GET); } int ht_add(hashtable_t *ht, const void *key, const void *val) { return perform_operation(ht, key, val, NULL, NULL, ADD); } int ht_set(hashtable_t *ht, const void *key, const void *val, void **oldkey, void **oldval) { return perform_operation(ht, key, val, oldkey, oldval, SET); } int ht_rem(hashtable_t *ht, const void *key, void **storedkey, void **val) { return perform_operation(ht, key, NULL, storedkey, val, REM); } // As mentioned in hashtable.h, this func does not deallocate keys // nor vals. One could use ht_map_destroy() if that is needed. void ht_destroy(hashtable_t *ht) { if (!ht->entries) return; ht_finish_resizing(ht); struct ht_node **tab = ht->tab; for (ssize_t position = ht->tab_size - 1; position >= 0; position--) { struct ht_node *pair = tab[position], *nextpair; while (pair) { nextpair = pair->next; free(pair); pair = nextpair; } } free(ht->tab); } void ht_map(hashtable_t *ht, void *arg, void (*mapfunc)(const void *key, void *val, void *arg)) { if (!ht->entries) return; ht_finish_resizing(ht); struct ht_node **tab = ht->tab; for (ssize_t position = ht->tab_size - 1; position >= 0; position--) for (struct ht_node *pair = tab[position]; pair; pair = pair->next) mapfunc(pair->key, (void*) pair->val, arg); } void ht_map_destroy(hashtable_t *ht, void *arg, void (*mapfunc)(void *key, void *val, void *arg)) { // If mapfunc() deallocates keys, the following 2 lines make // assumption on ht_destroy(), that it doesn't call ht->hashfunc() // or ht->cmpfunc() on keys. ht_map(ht, arg, (void (*)(const void*, void*, void*)) mapfunc); ht_destroy(ht); } // These 2 functions are for easy making of hashtable with strings as // keys. Note that this hash is *not* secure against DoS attacks. size_t ht_string_hash(const char *key) { size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4; do { char shift = ((unsigned char) key[i]) % sizeof(size_t); hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift))) ^ key[i]; } while (key[i++]); return hash; } int ht_string_init(hashtable_t *ht) { return ht_init(ht, (size_t (*)(const void*)) &ht_string_hash, (int (*)(const void*, const void*)) &strcmp); }