aboutsummaryrefslogtreecommitdiff
/**
 * C hashtable implementation
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * GENERAL INFO
 *
 * You might want to read the beginning of hashtable.h first.
 *
 * In some places "rehashing" and in other ones "resizing" seemed to
 * be the right word to use. They mean more or less the same.
 *
 * Functions starting with ht_ are part of the API. Internal functions
 * are declared static. I also made some of them inline (either
 * because they were extremely short or only called from 1 place).
 *
 * Hashtable size is always a power of 2.
 *
 * When the hashtable is ¾ full, a new, 2x bigger table is allocated
 * and whenever one of 4 basic operations (adding, removing, setting,
 * getting) occurs, 4 slots are being rehashed from old table into 8
 * slots in new table. Similarly, when hashtable is ¼ full, a new,
 * 2x smaller table is allocated and each of subsequent operations
 * rehashes 8 entries from old table into 4 in new table.
 * This mechanism has been made lazier: getting and removing don't
 * trigger growing of ht even if it's 3/4 full. Similarly, getting,
 * setting and adding don't trigger shrinking.
 * Once resizing is triggered, however, any of the operations will
 * contribute to rehashing. Even if, for example, the operation is
 * ADD and the table is being shrinked.
 * This means, that if we have a hashtable of size n which is ¾ full
 * and growing is triggered, then each subsequent call to
 * ht_{add,rem,get,set}() rehashes some entries and, depending on
 * how frequently and how successfully each of these 4 funcs was
 * called, at the end of resizing we get a size 2n hashtable which is
 * between ¼ and ½ full. Similarly, if shrinking of a ¼ full
 * hashtable of size n is triggered, then after some operations we
 * get a size ½n hashtable, that is somewhere between ¼ and ¾ full.
 * One can see now, that we always keep the hashtable between ¼ and ¾
 * full (with the exception of a minimal size one, that can be empty).
 */

#include "hashtable.h"

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>

#ifdef _USE_INLINE
#define INLINE inline
#else
#define INLINE
#endif

/*
 * We won't shrink hashtable below this size. Newly created one will
 * be this big.
 */
#define MIN_SIZE 4

/* Special value of ht->rehashing_position. */
#define NOT_REHASHING ((ssize_t) -1)

/*
 * Those are possible return values of do_resizing_related_stuff()
 * and rehash_some_entries() (which only returns the first 2).
 */
#define REHASHING_IN_PROGRESS     0
#define REHASHING_NOT_IN_PROGRESS 1
#define REHASHING_NO_MEM          2

/* Struct used to store a pair. */
struct ht_node
{
  const void *key;
  const void *val;
  struct ht_node *next;
};

enum op
  {
    GET,
    GET_THREADSAFE,
    ADD,
    SET,
    REM,
  };

int ht_init(hashtable_t *ht,
	    size_t (*hash)(const void *key),
	    int (*cmp)(const void *key1, const void *key2))
{
  if (!(ht->tab = calloc(MIN_SIZE, sizeof(struct ht_node**))))
    return HT_NO_MEM;

  ht->tab_size = MIN_SIZE;
  ht->rehashing_position = NOT_REHASHING;
  ht->entries = 0;
  ht->hashfunc = hash;
  ht->cmpfunc = cmp;

  return HT_OK;
}

/* First come some utilities :) */

static INLINE size_t min(size_t n1, size_t n2)
{
  return n1 < n2 ? n1 : n2;
}

static INLINE size_t hash2(size_t n)
{
  /* I found this "hash improver" on the internet. */
  n ^= (n >> 20) ^ (n >> 12);
  return n ^ (n >> 7) ^ (n >> 4);
}

/* Below are 2 list-handling utility functions. */
static INLINE struct ht_node *join_lists(struct ht_node *l1,
					 struct ht_node *l2)
{
  if (!l1) return l2;
  if (!l2) return l1;

  struct ht_node *l1_last;

  for (l1_last = l1; l1_last->next; l1_last = l1_last->next);

  /* Append l2 to the end of l1. */
  l1_last->next = l2;

  /* For convenience return the first element of the resulting list. */
  return l1;
}


static INLINE void push(struct ht_node *node, struct ht_node **list)
{
  node->next = *list;
  *list = node;
}

/*
 * The following 2 rehash_* functions are helpers of rehash_some_entries().
 * This func rehashes 1 chain of entries in tab[] into 2 chains in newtab[].
 */
static INLINE void rehash_position_growing(hashtable_t *ht)
{
  /* There are 2 possible new positions of an entry in a 2x bigger ht. */
  struct ht_node *list0 = NULL, *list1 = NULL;

  size_t old_position = ht->rehashing_position,
    new_position0 = old_position,
    new_position1 = old_position | ht->tab_size;

  struct ht_node *pair = ht->tab[old_position], *next_pair;

  while (pair)
    {
      next_pair = pair->next;

      size_t new_position = hash2(ht->hashfunc(pair->key))
	& (ht->new_size - 1);

      push(pair, new_position == new_position1 ? &list1 : &list0);

      pair = next_pair;
    }

  ht->newtab[new_position0] = list0;
  ht->newtab[new_position1] = list1;

  ht->rehashing_position++;
}

/* This func rehashes 2 chains of entries in tab[] into 1 chain in newtab[]. */
static INLINE void rehash_2positions_shrinking(hashtable_t *ht)
{
  size_t new_position = ht->rehashing_position,
    old_position0 = new_position,
    old_position1 = new_position | ht->new_size;

  ht->newtab[new_position] = join_lists(ht->tab[old_position0],
					ht->tab[old_position1]);

  ht->rehashing_position++;
}

/*
 * Rehashes 4(8) positions from tab to newtab. If those were the last
 * enetries to rehash, the function takes care of everything
 * (like deallocating old tab) and returns REHASHING_NOT_IN_PROGRESS.
 * Otherwise, returns REHASHING_IN_PROGRESS.
 * Caller must make sure rehashing was started b4 calling this func.
 */
static int rehash_some_entries(hashtable_t *ht)
{
  int rehashes_left = 4;

  if (ht->new_size > ht->tab_size)                    /* growing ht */
    {
      while(rehashes_left--) rehash_position_growing(ht);
      if (ht->rehashing_position != ht->tab_size)
	return REHASHING_IN_PROGRESS;
    }
  else                                                /* shrinking ht */
    {
      while(rehashes_left--) rehash_2positions_shrinking(ht);
      if (ht->rehashing_position != ht->new_size)
	return REHASHING_IN_PROGRESS;
    }

  /* rehashing finishes */
  ht->rehashing_position = NOT_REHASHING;
  ht->tab_size = ht->new_size;
  free(ht->tab);
  ht->tab = ht->newtab;

  return REHASHING_NOT_IN_PROGRESS;
}

static INLINE bool resizing_taking_place(hashtable_t *ht)
{
  return !(ht->rehashing_position == NOT_REHASHING);
}

void ht_finish_resizing(hashtable_t *ht)
{
  if (resizing_taking_place(ht))
    while (rehash_some_entries(ht) == REHASHING_IN_PROGRESS);
}

static INLINE bool needs_growing(hashtable_t *ht)
{
  return ht->entries >= 3 * ht->tab_size / 4;
}

static INLINE bool needs_shrinking(hashtable_t *ht)
{
  return ht->tab_size > MIN_SIZE
    && ht->entries <= ht->tab_size / 4;
}
/*
 * Each of hashtable operations (add, set, rem, get) should also
 * attempt to do part of resizing. This way resizing operation
 * which is O(n) is distributed among many hashtable accesses
 * each of them still being O(1). Without this the the amortized
 * complexity of ht accesses would still be O(1), but a single access
 * would sometimes be O(n).
 * Other function that adds, sets, gets or removes sth from ht uses
 * this one to do this "part of resizing" mentioned above.
 * This func returns REHASHING_NO_MEM on failed malloc (won't happen
 * for GET or REM operation) and REHASHING_[NOT_]IN_PROGRESS otherwise.
 */
static INLINE int
do_resizing_related_stuff(hashtable_t *ht, const void *key, enum op op)
{
  bool resizing = resizing_taking_place(ht);

  if (!resizing)
    {
      size_t new_size;

      switch (op)
	{
	case GET:
	  goto dont_start_resizing;
	case ADD:
	case SET:
	  if (needs_growing(ht))
	    new_size = ht->tab_size * 2;
	  else
	    goto dont_start_resizing;
	  break;
	default: /* case REM */
	  if (needs_shrinking(ht))
	    new_size = ht->tab_size / 2;
	  else
	    goto dont_start_resizing;
	}

      struct ht_node **newtab;
      if (!(newtab = malloc(new_size * sizeof(struct ht_node*))))
	return op == REM ? REHASHING_NOT_IN_PROGRESS : REHASHING_NO_MEM;

      ht->newtab = newtab;
      ht->new_size = new_size;
      ht->rehashing_position = 0;

      resizing = true;
    }

 dont_start_resizing:

  return resizing ?
    rehash_some_entries(ht) : REHASHING_NOT_IN_PROGRESS;
}

/*
 * This is a chaining hashtable, so each element in the array (table)
 * is actually a list of entries. All operations (adding, removing,
 * etc.) need to find the right list of entries (here called "bucket")
 * for a given key first, so it makes sense to do it in a separate
 * function. The bucket may be in tab or newtab if resizing is taking
 * place. Being informed by the caller if resizing is in progress,
 * this func does not need to check for it by itself.
 */
static INLINE struct ht_node **find_bucket(hashtable_t *ht, const void *key,
					   bool resizing_in_progress)
{
  size_t hash = hash2(ht->hashfunc(key)),
    destination_tab_size, position;

  struct ht_node **destination_tab;

  if (resizing_in_progress)
    /*
     * Here we must check whether our key's bucket is still
     * in ht->tab or already rehashed to ht->newtab.
     */
    {
      size_t smaller_tab_size = min(ht->tab_size, ht->new_size),
	smaller_tab_position = hash & (smaller_tab_size - 1);

      if (smaller_tab_position < ht->rehashing_position)
	{
	  destination_tab = ht->newtab;
	  destination_tab_size = ht->new_size;
	}
      else
	{
	  destination_tab = ht->tab;
	  destination_tab_size = ht->tab_size;
	}
    }
  else
    /* In this case we know, we're working on ht->tab and not newtab. */
    {
      destination_tab = ht->tab;
      destination_tab_size = ht->tab_size;
    }

  position = hash & (destination_tab_size - 1);
  return &destination_tab[position];
}

/*
 * Operations of adding, removing, etc. all work on list of entries
 * (bucket) to wchich key hashes and they have some common logic, so
 * it made sense to make a single function, that does the right
 * operation based on an enum passed to it.
 */
static INLINE int
perform_operation_on_bucket(hashtable_t *ht, struct ht_node **bucket,
			    const void *key, const void *val,
			    void **keyptr, void **valptr,
			    enum op op)
{
  struct ht_node **pairptr, *pair;

  for (pairptr = bucket, pair = *pairptr;
       pair;
       pairptr = &pair->next, pair = pair->next)

    if (!ht->cmpfunc(key, pair->key))
      {
	if (op == ADD)
	  return HT_KEY_PRESENT;

	if (keyptr) *keyptr = (void*) pair->key;
	if (valptr) *valptr = (void*) pair->val;

	switch (op)
	  {
	  case GET:
	  case GET_THREADSAFE:
	    {
	      return HT_OK;
	    }
	  case SET:
	    {
	      pair->key = key;
	      pair->val = val;
	      return HT_OK;
	    }
	  default: /* case REM */
	    {
	      *pairptr = pair->next;
	      free(pair);
	      ht->entries--;
	      return HT_OK;
	    }
	  }
      }

  if (op == GET || op == GET_THREADSAFE || op == REM)
    return HT_KEY_ABSENT;

  /* op == ADD || op == SET */

  struct ht_node *new_pair = malloc(sizeof(struct ht_node));
  if (!new_pair)
    return HT_NO_MEM;

  *new_pair = (struct ht_node) {.key = key, .val = val};
  push(new_pair, bucket);
  ht->entries++;

  return HT_OK;
}

/* Generic function for performing of adding, removing, setting and getting. */
static int perform_operation(hashtable_t *ht, const void *key, const void *val,
			     void **keyptr, void **valptr, enum op op)
{
  bool resizing_in_progress;

  if (op == GET_THREADSAFE) {
    resizing_in_progress = resizing_taking_place(ht);
    goto skip_resizing;
  }

  switch (do_resizing_related_stuff(ht, key, op))
    {
    case REHASHING_IN_PROGRESS:
      resizing_in_progress = true;
      break;
    case REHASHING_NOT_IN_PROGRESS:
      resizing_in_progress = false;
      break;
    default: /* case REHASHING_NO_MEM */
      return HT_NO_MEM;
    }

  struct ht_node **bucket;

 skip_resizing:
  bucket = find_bucket(ht, key, resizing_in_progress);

  return perform_operation_on_bucket(ht, bucket, key, val,
				     keyptr, valptr, op);
}

/* The 5 functions below are the main part of the API. */
int ht_get(hashtable_t *ht, const void *key,
	   void **storedkey, void **val)
{
  return perform_operation(ht, key, NULL, storedkey, val, GET);
}

int ht_get_threadsafe(hashtable_t *ht, const void *key,
		      void **storedkey, void **val)
{
  return perform_operation(ht, key, NULL, storedkey, val,
			   GET_THREADSAFE);
}

int ht_add(hashtable_t *ht, const void *key, const void *val)
{
  return perform_operation(ht, key, val, NULL, NULL, ADD);
}

int ht_set(hashtable_t *ht, const void *key, const void *val,
	   void **oldkey, void **oldval)
{
  return perform_operation(ht, key, val, oldkey, oldval, SET);
}

int ht_rem(hashtable_t *ht, const void *key,
	   void **storedkey, void **val)
{
  return perform_operation(ht, key, NULL, storedkey, val, REM);
}

/*
 * As mentioned in hashtable.h, this func does not deallocate keys
 * nor vals. One could use ht_map_destroy() if that is needed.
 */
void ht_destroy(hashtable_t *ht)
{
  ssize_t position;

  if (!ht->entries)
    goto free_tab;

  ht_finish_resizing(ht);

  struct ht_node **tab = ht->tab;

  for (position = ht->tab_size - 1; position >= 0; position--)
    {
      struct ht_node *pair = tab[position], *nextpair;

      while (pair)
	{
	  nextpair = pair->next;
	  free(pair);
	  pair = nextpair;
	}
    }

 free_tab:
  free(ht->tab);
}

void ht_map(hashtable_t *ht, void *arg,
	    void (*mapfunc)(const void *key, void *val, void *arg))
{
  ssize_t position;

  if (!ht->entries)
    return;

  ht_finish_resizing(ht);

  struct ht_node **tab = ht->tab, *pair;

  for (position = ht->tab_size - 1; position >= 0; position--)
    {
      for (pair = tab[position]; pair; pair = pair->next)
	mapfunc(pair->key, (void*) pair->val, arg);
    }
}

void ht_map_destroy(hashtable_t *ht, void *arg,
		    void (*mapfunc)(void *key, void *val, void *arg))
{
  /*
   * If mapfunc() deallocates keys, the following 2 lines make
   * assumption on ht_destroy(), that it doesn't call ht->hashfunc()
   * or ht->cmpfunc() on keys.
   */
  ht_map(ht, arg, (void (*)(const void*, void*, void*)) mapfunc);
  ht_destroy(ht);
}

/*
 * These 2 functions are for easy making of hashtable with strings as
 * keys. Note that this hash is *not* secure against DoS attacks.
 */
size_t ht_string_hash(const char *key)
{
  size_t i = 0, hash = (size_t) 0xa1bad2dead3beef4;

  do
    {
      char shift = ((unsigned char) key[i]) % sizeof(size_t);
      hash += ((hash >> shift) | (hash << (sizeof(size_t) - shift)))
	^ key[i];
    }
  while (key[i++]);

  return hash;
}

int ht_string_init(hashtable_t *ht)
{
  return ht_init(ht, (size_t (*)(const void*)) &ht_string_hash,
		 (int (*)(const void*, const void*)) &strcmp);
}