From 4d7361009863a0ea2bd75919a79574cdc213c72b Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Tue, 23 Jul 2019 13:47:34 +0200 Subject: initial commit --- example_use.c | 135 ++++++++++++++++ hashtable.c | 502 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ hashtable.h | 109 +++++++++++++ 3 files changed, 746 insertions(+) create mode 100644 example_use.c create mode 100644 hashtable.c create mode 100644 hashtable.h diff --git a/example_use.c b/example_use.c new file mode 100644 index 0000000..bbd2433 --- /dev/null +++ b/example_use.c @@ -0,0 +1,135 @@ +// I didn't take the effort to clean up this file as I did with +// hashtable.c and hashtable.h. It's not really important, tho. +// Instead of this "console hashtable" I should make some test cases +// (which I probably won't be motivated enough to do anyway). + +#include +#include +#include + +#include "hashtable.h" + +void print_pair(const char *key, const char *val, void *dummy){ + printf("key: %s val: %s\n", key, val); +} + +void free_pair(void *key, void *val, void *dummy){ + free(key); + free(val); +} + +int main(int argc, char **argv){ + hashtable_t ht; + if (ht_string_init(&ht)) // HT_NO_MEM + { + fprintf(stderr, "ERROR, couldn't allocate memory!\n"); + exit(-1); + } + + char buf[100]; + + while (scanf("%99s", buf) != EOF) { + if (!strcmp(buf, "add")) { + if (scanf("%99s", buf) == EOF) break; + + size_t len = strlen(buf) + 1; + char *key = malloc(len); + if (key == NULL) { + fprintf(stderr, "ERROR, couldn't allocate memory!\n"); + exit(1); + } + + memcpy(key, buf, len); + + if (scanf("%99s", buf) == EOF) { + free(key); + break; + } + + len = strlen(buf) + 1; + char *val = malloc(len); + if (val == NULL) { + free(key); + fprintf(stderr, "ERROR, couldn't allocate memory!\n"); + exit(1); + } + + memcpy(val, buf, len); + + switch (ht_add(&ht, key, val)) + { + case HT_OK: + break; + case HT_NO_MEM: + fprintf(stderr, "ERROR, couldn't allocate memory!\n"); + free(key); free(val); + exit(1); + default: // case HT_KEY_PRESENT + printf("Key already present!\n"); + } + } + else if (!strcmp(buf, "set")) { + if (scanf("%99s", buf) == EOF) break; + + size_t len = strlen(buf) + 1; + char *key = malloc(len); + if (key == NULL) { + fprintf(stderr, "ERROR, couldn't allocate memory!\n"); + exit(1); + } + + memcpy(key, buf, len); + + if (scanf("%99s", buf) == EOF) break; + + len = strlen(buf) + 1; + char *val = malloc(len); + if (val == NULL) { + free(key); + fprintf(stderr, "ERROR, couldn't allocate memory!\n"); + exit(1); + } + + memcpy(val, buf, len); + + void *oldkey = NULL, *oldval = NULL; + if (ht_set(&ht, key, val, &oldkey, &oldval) != 0) { + free(key); free(val); + + fprintf(stderr, "ERROR, couldn't allocate memory!\n"); + exit(1); + } + + if (oldval != NULL) { + printf("Replaced: %s\n", (char*) oldval); + free(oldval); free(oldkey); + } + } + else if (!strcmp(buf, "get")) { + if (scanf("%99s", buf) == EOF) break; + + void *val; + if (ht_get(&ht, buf, NULL, &val)) + printf("No such entry!\n"); + else printf("%s\n", (char*) val); + } + else if (!strcmp(buf, "rem")) { + if (scanf("%99s", buf) == EOF) break; + + void *storedkey, *val; + if (ht_rem(&ht, buf, &storedkey, &val)) + printf("No such entry!\n"); + else { + printf("%s\n", (char*) val); + free(val); free(storedkey); + } + } + else if(!strcmp(buf, "entries")) printf("%zu\n", ht.entries); + else if(!strcmp(buf, "print")) + ht_map(&ht, NULL, (void (*)(void*, void*, void*)) &print_pair); + else printf("I have no idea what you're talking about...\n"); + } + + ht_map_destroy(&ht, NULL, + (void (*)(void*, void*, void*)) &free_pair); +} diff --git a/hashtable.c b/hashtable.c new file mode 100644 index 0000000..c2dc582 --- /dev/null +++ b/hashtable.c @@ -0,0 +1,502 @@ +// GENERAL INFO + +// You might want to read the beginning of hashtable.h first. + +// In some places "rehashing" and in other ones "resizing" seemed to +// be the right word to use. They mean more or less the same. + +// Functions starting with ht_ are part of the API. Internal functions +// are declared static. I also made some of them inline (either +// because they were extremely short or only called from 1 place). + +// Hashtable size is always a power of 2. + +// When the hashtable is ¾ full, a new, 2x bigger table is allocated +// and whenever one of 4 basic operations (adding, removing, setting, +// getting) occurs, 4 slots are being rehashed from old table into 8 +// slots in new table. Similarly, when hashtable is ¼ full, a new, +// 2x smaller table is allocated and each of subsequent operations +// rehashes 8 entries from old table into 4 in new table. +// This mechanism has been made lazier: getting and removing don't +// trigger growing of ht even if it's 3/4 full. Similarly, getting, +// setting and adding don't trigger shrinking. +// Once resizing is triggered, however, any of the operations will +// contribute to rehashing. Even if, for example, the operation is +// ADD and the table is being shrinked. +// This means, that if we have a hashtable of size n which is ¾ full +// and growing is triggered, then each subsequent call to +// ht_{add,rem,get,set}() rehashes some entries and, depending on +// how frequently and how successfully each of these 4 funcs was +// called, at the end of resizing we get a size 2n hashtable which is +// between ¼ and ½ full. Similarly, if shrinking of a ¼ full +// hashtable of size n is triggered, then after some operations we +// get a size ½n hashtable, that is somewhere between ¼ and ¾ full. +// One can see now, that we always keep the hashtable between ¼ and ¾ +// full (with the exception of a minimal size one, that can be empty). + +#include "hashtable.h" + +#include +#include +#include +#include + +// We won't shrink hashtable below this size. Newly created one will +// be this big. +#define MIN_SIZE 32 + +// Special value of ht->rehashing_position. +#define NOT_REHASHING ((ssize_t) -1) + +// Those are possible return values of do_resizing_related_stuff() +// and rehash_some_entries() (which only returns the first 2). +#define REHASHING_IN_PROGRESS 0 +#define REHASHING_NOT_IN_PROGRESS 1 +#define REHASHING_NO_MEM 2 + +enum op + { + GET, + ADD, + SET, + REM, + }; + +int ht_init(hashtable_t *ht, + size_t (*hash)(void *key), + int (*cmp)(void *key1, void *key2)) +{ + if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**)))) + return HT_NO_MEM; + + ht->tab_size = MIN_SIZE; + ht->rehashing_position = NOT_REHASHING; + ht->entries = 0; + ht->hashfunc = hash; + ht->cmpfunc = cmp; + + return HT_OK; +} + +// First come some utilities :) + +static inline size_t min(size_t n1, size_t n2) +{ + return n1 < n2 ? n1 : n2; +} + +static inline size_t hash2(size_t n) +{ + // I found this "hash improver" on the internet. + n ^= (n >> 20) ^ (n >> 12); + return n ^ (n >> 7) ^ (n >> 4); +} + +// Below are 2 list-handling utility functions. +static inline struct ht_node *join_lists(struct ht_node *l1, + struct ht_node *l2) +{ + if (!l1) return l2; + if (!l2) return l1; + + struct ht_node *l1_last; + + for (l1_last = l1; l1_last->next; l1_last = l1_last->next); + + // Append l2 to the end of l1. + l1_last->next = l2; + + // For convenience return the first element of the resulting list. + return l1; +} + + +static inline void push(struct ht_node *node, struct ht_node **list) +{ + node->next = *list; + *list = node; +} + +// The following 2 rehash_* functions are helpers of +// rehash_some_entries(). +// This func rehashes 1 chain of entries in tab[] +// into 2 chains in newtab[]. +static inline void rehash_position_growing(hashtable_t *ht) +{ + // There are 2 possible new positions of an entry in a 2x bigger ht. + struct ht_node *list0 = NULL, *list1 = NULL; + + size_t old_position = ht->rehashing_position, + new_position0 = old_position, + new_position1 = old_position | ht->tab_size; + + struct ht_node *pair = ht->tab[old_position], *next_pair; + + while (pair) + { + next_pair = pair->next; + + size_t new_position = hash2(ht->hashfunc(pair->key)) + & (ht->new_size - 1); + + push(pair, new_position == new_position1 ? &list1 : &list0); + + pair = next_pair; + } + + ht->newtab[new_position0] = list0; + ht->newtab[new_position1] = list1; + + ht->rehashing_position++; +} + +// This func rehashes 2 chains of entries in tab[] +// into 1 chain in newtab[]. +static inline void rehash_2positions_shrinking(hashtable_t *ht) +{ + size_t new_position = ht->rehashing_position, + old_position0 = new_position, + old_position1 = new_position | ht->new_size; + + ht->newtab[new_position] = join_lists(ht->tab[old_position0], + ht->tab[old_position1]); + + ht->rehashing_position++; +} + +// Rehashes 4(8) positions from tab to newtab. If those were the last +// enetries to rehash, the function takes care of everything +// (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS. +// Otherwise, returns REHASHING_IN_PROGRESS. +// Caller must make sure rehashing was started b4 calling this func. +static int rehash_some_entries(hashtable_t *ht) +{ + int rehashes_left = 4; + + if (ht->new_size > ht->tab_size) // growing ht + { + while(rehashes_left--) rehash_position_growing(ht); + if (ht->rehashing_position != ht->tab_size) + return REHASHING_IN_PROGRESS; + } + else // shrinking ht + { + while(rehashes_left--) rehash_2positions_shrinking(ht); + if (ht->rehashing_position != ht->new_size) + return REHASHING_IN_PROGRESS; + } + + // rehashing finishes + ht->rehashing_position = NOT_REHASHING; + ht->tab_size = ht->new_size; + free(ht->tab); + ht->tab = ht->newtab; + + return REHASHING_NOT_IN_PROGRESS; +} + +static inline bool resizing_taking_place(hashtable_t *ht) +{ + return !(ht->rehashing_position == NOT_REHASHING); +} + +void ht_finish_resizing(hashtable_t *ht) +{ + if (resizing_taking_place(ht)) + while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS); +} + +static inline bool needs_growing(hashtable_t *ht) +{ + return ht->entries == 3 * ht->tab_size / 4; +} + +static inline bool needs_shrinking(hashtable_t *ht) +{ + return ht->tab_size > MIN_SIZE + && ht->entries == ht->tab_size / 4; +} + +// Each of hashtable operations (add, set, rem, get) should also +// attempt to do part of resizing. This way resizing operation +// which is O(n) is distributed among many hashtable accesses +// each of them still being O(1). Without this the the amortized +// complexity of ht accesses would still be O(1), but a single access +// would sometimes be O(n). +// Other function that adds, sets, gets or removes sth from ht uses +// this one to do this "part of resizing" mentioned above. +// This func returns REHASHING_NO_MEM on failed malloc (won't happen +// for GET operation) and REHASHING_(NOT_)IN_PROGRESS otherwise. +static inline int do_resizing_related_stuff(hashtable_t *ht, + void *key, enum op op) +{ + bool resizing = resizing_taking_place(ht); + + if (!resizing) + { + size_t new_size; + + switch (op) + { + case GET: + goto dont_start_resizing; + case ADD: + case SET: + if (needs_growing(ht)) + new_size = ht->tab_size * 2; + else + goto dont_start_resizing; + break; + default: // case REM + if (needs_shrinking(ht)) + new_size = ht->tab_size / 2; + else + goto dont_start_resizing; + } + + struct ht_node **newtab; + if (!(newtab = malloc(new_size * sizeof(struct ht_node*)))) + return REHASHING_NO_MEM; + + ht->newtab = newtab; + ht->new_size = new_size; + ht->rehashing_position = 0; + + resizing = true; + } + + dont_start_resizing: + + return resizing ? + rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS; +} + +// This is a chaining hashtable, so each element in the array (table) +// is actually a list of entries. All operations (adding, removing, +// etc.) need to find the right list of entries (here called "bucket") +// for a given key first, so it makes sense to do it in a separate +// function. The bucket may be in tab or newtab if resizing is taking +// place. Being informed by the caller if resizing is in progress, +// this func does not need to check for it itself. +static inline struct ht_node **find_bucket(hashtable_t *ht, void *key, + bool resizing_in_progress) +{ + size_t hash = hash2(ht->hashfunc(key)), + destination_tab_size, position; + + struct ht_node **destination_tab; + + if (resizing_in_progress) + // Here we must check whether our key's bucket is still + // in ht->tab or already rehashed to ht->newtab. + { + size_t smaller_tab_size = min(ht->tab_size, ht->new_size), + smaller_tab_position = hash & (smaller_tab_size - 1); + + if (smaller_tab_position < ht->rehashing_position) + { + destination_tab = ht->newtab; + destination_tab_size = ht->new_size; + } + else + { + destination_tab = ht->tab; + destination_tab_size = ht->tab_size; + } + } + else + // In this case we know, we're working on ht->tab and not newtab. + { + destination_tab = ht->tab; + destination_tab_size = ht->tab_size; + } + + position = hash & (destination_tab_size - 1); + return &destination_tab[position]; +} + +// Operations of adding, removing, etc. all work on list of entries +// (bucket) to wchich key hashes and they have some common logic, so +// it made sense to make a single function, that does the right +// operation based on an enum passed to it. +static inline int +perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket, + void *key, void *val, + void **keyptr, void **valptr, + enum op op) +{ + for (struct ht_node **pairptr = bucket, *pair = *pairptr; + pair; + pairptr = &pair->next, pair = pair->next) + + if (!ht->cmpfunc(key, pair->key)) + { + if (op == ADD) + return HT_KEY_PRESENT; + + if (keyptr) *keyptr = pair->key; + if (valptr) *valptr = pair->val; + + switch (op) + { + case GET: + { + return HT_OK; + } + case SET: + { + pair->key = key; + pair->val = val; + return HT_OK; + } + default: // case REM + { + *pairptr = pair->next; + free(pair); + ht->entries--; + return HT_OK; + } + } + } + + if (op == GET || op == REM) + return HT_KEY_ABSENT; + + // op == ADD || op == SET + + struct ht_node *new_pair = malloc(sizeof(struct ht_node)); + if (!new_pair) + return HT_NO_MEM; + + *new_pair = (struct ht_node) {.key = key, .val = val}; + push(new_pair, bucket); + ht->entries++; + + return HT_OK; +} + +// Generic function for performing of adding, removing, setting and +// getting. +static int perform_operation(hashtable_t *ht, void *key, void *val, + void **keyptr, void **valptr, enum op op) +{ + bool resizing_in_progress; + + switch (do_resizing_related_stuff(ht, key, op)) + { + case REHASHING_IN_PROGRESS: + resizing_in_progress = true; + break; + case REHASHING_NOT_IN_PROGRESS: + resizing_in_progress = false; + break; + default: // case REHASHING_NO_MEM + return HT_NO_MEM; + } + + struct ht_node **bucket = + find_bucket(ht, key, resizing_in_progress); + + return perform_operation_on_bucket(ht, bucket, key, val, + keyptr, valptr, op); +} + +// The 4 functions below are the main part of the API. +int ht_get(hashtable_t *ht, void *key, + void **storedkey, void **val) +{ + return perform_operation(ht, key, NULL, storedkey, val, GET); +} + +int ht_add(hashtable_t *ht, void *key, void *val) +{ + return perform_operation(ht, key, val, NULL, NULL, ADD); +} + +int ht_set(hashtable_t *ht, void *key, void *val, + void **oldkey, void **oldval) +{ + return perform_operation(ht, key, val, oldkey, oldval, SET); +} + +int ht_rem(hashtable_t *ht, void *key, + void **storedkey, void **val) +{ + return perform_operation(ht, key, NULL, storedkey, val, REM); +} + +// As mentioned in hashtable.h, this func does not deallocate keys +// nor vals. One could use ht_map() to free() them before calling this +// func. +void ht_destroy(hashtable_t *ht) +{ + if (!ht->entries) return; + ht_finish_resizing(ht); + + struct ht_node **tab = ht->tab; + + for (ssize_t position = ht->tab_size - 1; + position >= 0; position--) + { + struct ht_node *pair = tab[position], *nextpair; + + while (pair) + { + nextpair = pair->next; + free(pair); + pair = nextpair; + } + } + + free(ht->tab); +} + +void ht_map(hashtable_t *ht, void *arg, + void (*mapfunc)(void *key, void *val, void *arg)) +{ + if (!ht->entries) return; + ht_finish_resizing(ht); + + struct ht_node **tab = ht->tab; + + for (ssize_t position = ht->tab_size - 1; + position >= 0; position--) + + for (struct ht_node *pair = tab[position]; + pair; pair = pair->next) + + mapfunc(pair->key, pair->val, arg); +} + +void ht_map_destroy(hashtable_t *ht, void *arg, + void (*mapfunc)(void *key, void *val, void *arg)) +{ + // If mapfunc() deallocates keys, then the next 2 lines require + // assumption on ht_destroy(), that it doesn't call ht->hashfunc() + // or ht->cmpfunc() on keys. + ht_map(ht, arg, mapfunc); + ht_destroy(ht); +} + +// These 2 functions are for easy making of hashtable with strings as +// keys. +size_t ht_string_hash(const char *key) +{ + size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4; + + do + { + char shift = ((unsigned char) key[i]) % sizeof(size_t); + hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift))) + ^ key[i]; + } + while (key[i++]); + + return hash; +} + +int ht_string_init(hashtable_t *ht) +{ + return ht_init(ht, (size_t (*)(void*)) &ht_string_hash, + (int (*)(void*, void*)) &strcmp); +} diff --git a/hashtable.h b/hashtable.h new file mode 100644 index 0000000..9c04c4c --- /dev/null +++ b/hashtable.h @@ -0,0 +1,109 @@ +// This is a separate chaining hashtable for general use. It's not +// universal: it uses malloc() and free(), so it requires a standard +// library to function and it's for single-threaded use only. It does, +// however, have one advantage: it rehashes automatically, both when +// it grows in size and when it shrinks, while retaining O(1) access +// time. A normal hashtable with rehashing would have amortized O(1) +// access time, but there would be single access with O(n) time +// complexity for each rehashing. In this hashtable, rehashing is done +// in parts. For example, a ht_add(), aside from adding an entry, +// might also rehash 4 other entries from old table to the new one and +// leave the rest unrehashed. +// Of course, it is assumed that a good hash function is provided +// by the programmer. If not, accesses may still degenerate to O(n). + +#include // for ssize_t + +// These are possible retrn values of some ht_ functions (see below). +#define HT_OK 0 +#define HT_NO_MEM -1 +#define HT_KEY_PRESENT -2 +#define HT_KEY_ABSENT -3 + +// Struct used to store the pair. Implementation detail. +struct ht_node +{ + void *key; + void *val; + struct ht_node *next; +}; + +typedef struct +{ + // All members are considered implementation details, except for + // "entries", which can be read, but should not be modified by + // external code. + size_t entries; + + // tab[] is where entries (chains of entries) are stored. + // When rehashing, newtab[] is also used. + struct ht_node **tab, **newtab; + + // sizes of tab[] and newtab[], obviously + size_t tab_size, new_size; + + size_t (*hashfunc)(void* key); + int (*cmpfunc)(void* key1, void *key2); + + // When no rehashing is taking place, rehashing_position is -1 + // (#define'd as NOT_REHASHING in hashtable.c). At any other time, + // rehashing_position is the lowest not yet rehashed position in + // the smaller table. + ssize_t rehashing_position; +} hashtable_t; + +// All int functions return 0 (#define'd as HT_OK) on success and in +// case of failure they return error codes, as described below. + +// May fail with HT_NO_MEM. +int ht_init(hashtable_t *ht, + size_t (*hash)(void* key), + int (*cmp)(void* key1, void *key2)); + +// May fail with HT_NO_MEM and HT_KEY_PRESENT. +int ht_add(hashtable_t *ht, void *key, void *val); + +// May fail with HT_NO_MEM. If key was not yet present in hashtable, +// *oldkey and *oldval are not modified. Otherwise, just-replaced pair +// is stored in them. +int ht_set(hashtable_t *ht, void *key, void *val, + void **oldkey, void **oldval); + +// If present, the looked for pair is stored in *storedkey and *val. +// Otherwise, they're not modified and HT_KEY_ABSENT is returned. +// storedkey and/or val can be NULL. +int ht_get(hashtable_t *ht, void *key, void **storedkey, void **val); + +// Works like the above but also removes the pair from ht if found. +int ht_rem(hashtable_t *ht, void *key, void **storedkey, void **val); + +// De-initializes the hashtable freeing all it's structures. +// The programmer is responsible for freeing keys and values if they +// were allocated from the heap (see ht_map_destroy() below). +void ht_destroy(hashtable_t *ht); + +// Calls ht_finish_resizing(), then maps through ht. +void ht_map(hashtable_t *ht, void *arg, + void (*mapfunc)(void *key, void *val, void *arg)); + +// It might be tempting to use ht_map() to free() all keys and values +// stored in ht and then call ht_destroy(). If you think about it, +// ht_map() would leave hashtable in a broken state - with keys being +// deallocated. Depending on the implementation, ht_destroy() could +// cope with that, but we'd rather not guarrantee anything, so here's +// another function just for that - mapping through entries and +// destroying the hashtable immediately after, explicitly allowing +// the mapping function to dealocate keys. +void ht_map_destroy(hashtable_t *ht, void *arg, + void (*mapfunc)(void *key, void *val, void *arg)); + +// If hashtable is in the process of being rehashed, this function +// processes it to the end. Otherwise - it does nothing. +void ht_finish_resizing(hashtable_t *ht); + +// Included, since strings are commonly used as keys +size_t ht_string_hash(const char *key); + +// May fail with HT_NO_MEM. Initializes ht for use with string keys +// (using ht_string_hash() and strcmp()). +int ht_string_init(hashtable_t *ht); -- cgit v1.2.3