aboutsummaryrefslogtreecommitdiff
/**
 * part of Hydrilla
 * Routines for querying in-memory scriptbase, operating on data structures from
 * `scriptbase.h'.
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

#include <regex.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#include "hashtable.h"
#include "string_buf.h"

#include "scriptbase.h"

/*
 * Limits applied to a URL before it is looked up. Anything beyond them is cut
 * off and the URL is then treated as truncated by the lookup routines below.
 */
/* Maximum number of path segments taken into account. */
#define MAX_URL_PATH_LEN 12
/* Maximum number of path characters taken into account. */
#define MAX_URL_PATH_CHARS 255
/* Maximum number of domain labels taken into account. */
#define MAX_DOMAIN_LEN 7
/* Maximum number of domain characters taken into account. */
#define MAX_DOMAIN_CHARS 100

const struct script *get_script(const char *name, struct scriptbase *base)
{
	void *val;

	if (ht_get_threadsafe(&base->scripts, name, NULL, &val))
		return NULL;

	return ((struct script*) val)->filled ? val : NULL;
}

const struct bag *get_bag(const char *name, struct scriptbase *base)
{
	void *val;

	if (ht_get_threadsafe(&base->bags, name, NULL, &val))
		return NULL;

	return ((struct bag*) val)->filled ? val : NULL;
}

/*
 * Look the exact string `pattern' up in the pages table of `base'.
 *
 * Returns whatever the hashtable stored into the result pointer. The pointer
 * starts out as NULL and the status of the lookup itself is not inspected.
 */
const struct page *get_pattern(const char *pattern, struct scriptbase *base)
{
	void *found = NULL;

	(void) ht_get_threadsafe(&base->pages, pattern, NULL, &found);

	return found;
}

/*
 * POSIX extended regular expression that splits a URL into its parts.
 * Capture groups 1, 2 and 3 are the protocol (including "://"), the domain and
 * the optional path. The query is an optional group starting with a literal
 * '?' (written "\\?" in C, i.e. `\?' in the regex); the target (fragment)
 * is everything from an optional '#' onwards.
 */
static const char url_regex[] =
	"^"
	"([a-zA-Z]{1,20}://)" /* protocol */
	"([^/?#]{1,253})"     /* domain */
	"(/[^?#]*)?"          /* path */
	"(\\?[^#]*)?"         /* query */
	"#?.*"                /* target */
	"$";

/* Compiled form of url_regex; only valid while url_regex_ready is true. */
static regex_t url_regex_comp;
/*
 * Set by init_url_lookup_regex() when compilation succeeds, cleared by
 * destroy_url_lookup_regex().
 */
static bool url_regex_ready;

/*
 * Compile url_regex into url_regex_comp and record whether that worked.
 *
 * Returns the value returned by regcomp(): 0 on success, an error code
 * otherwise.
 */
int init_url_lookup_regex(void)
{
	const int status = regcomp(&url_regex_comp, url_regex, REG_EXTENDED);

	url_regex_ready = (status == 0);

	return status;
}

/*
 * Release the compiled URL regex and mark it as no longer usable.
 * When the regex is not currently initialized, only a complaint is written to
 * stderr.
 */
void destroy_url_lookup_regex(void)
{
	if (url_regex_ready) {
		regfree(&url_regex_comp);
		url_regex_ready = false;
		return;
	}

	fprintf(stderr, "Attempt to destroy uninitialized regex in " __FILE__ "\n");
}

/*
 * Number of regmatch_t slots handed to regexec(): slot 0 (the whole match)
 * plus the three capture groups indexed below.
 */
#define URL_REGEX_NMATCH 4

/* Indices of url_regex capture groups within the regmatch_t array. */
#define PROTOCOL_MATCH 1
#define DOMAIN_MATCH 2
#define PATH_MATCH 3

/*
 * Append a normalized copy of the path [path_begin, path_end) to `buf' (which
 * already holds the protocol and domain part of the key) and query
 * base->pages for that key and for keys made of progressively shorter path
 * prefixes followed by a wildcard suffix ("/*", "/**" or "/***").
 *
 * Normalization: runs of '/' are collapsed into a single one, a trailing '/'
 * is dropped, at most MAX_URL_PATH_LEN segments are kept and a path longer
 * than MAX_URL_PATH_CHARS is cut back to a preceding '/'. The code relies on
 * every segment being preceded by a '/' (url_regex makes the path start with
 * one).
 *
 * For every key found, `callback' is called with the stored value and `data'.
 *
 * Returns 1 as soon as `callback' returns non-zero, -2 when sb_bytes() or
 * sb_string() reports failure, 0 once all candidate keys have been tried (in
 * that case `buf' has been truncated back to its length on entry).
 */
static int lookup_url_path(const char *path_begin, const char *path_end,
			   struct stringbuf *buf, struct scriptbase *base,
			   int (*callback)(struct page*, void*), void *data)
{
	bool path_truncated = false;
	const char *segment_end = path_begin;
	int segments_allowed_left = MAX_URL_PATH_LEN;
	int segments_dropped = 0;
	int initial_len = buf->buf_filled;
	size_t len_path, previous_segment;
	void *val;
	/* Remembered now, because normalization below drops the final '/'. */
	bool trailing_dash = path_end != path_begin && path_end[-1] == '/';
	/* 0 - not determined yet; 1 or 3 - last segment is "*" or "***"; -1 - neither */
	int trailing_asterisks = 0;
	int result;

	/* Overlong path: move path_end back to a '/' within the character limit. */
	if (path_end - path_begin > MAX_URL_PATH_CHARS) {
		path_truncated = true;
		path_end = path_begin + MAX_URL_PATH_CHARS;
		while (*path_end != '/') {
			if (--path_end == path_begin)
				break;
		}
	}

	/* Copy the path into buf segment by segment, each as "/segment". */
	while (true) {
		/* Skip a run of slashes; finish when the path is exhausted. */
		do {
			if (path_begin >= path_end)
				goto after_path_normalization;
		} while (*(path_begin++) == '/');
		/*
		 * path_begin is now one past the segment's first character; step back
		 * onto the slash directly preceding the segment.
		 */
		path_begin -= 2;

		if (!segments_allowed_left--) {
			path_truncated = true;
			break;
		}

		/* Find the end of the segment: the next '/' or path_end. */
		segment_end = path_begin + 1;
		while (*segment_end != '/' && ++segment_end < path_end);

		if (sb_bytes(buf, path_begin, segment_end - path_begin))
			return -2;

		path_begin = segment_end;
	}

after_path_normalization:
/*
 * If `condition' holds: cut buf back to len_path bytes, append `wildcard' and
 * look the resulting key up in base->pages. Returns from the enclosing
 * function with -2 if appending fails and with 1 if the key is found and the
 * callback returns non-zero.
 */
#define TRY_WILDCARD(condition, wildcard)				\
	if (condition) {						\
		stringbuf_truncate(buf, len_path);			\
		if (sb_string(buf, wildcard))				\
			return -2;					\
									\
		result = ht_get_threadsafe(&base->pages, buf->buf,	\
					   NULL, &val);			\
		if (!result && callback(val, data))			\
			return 1;					\
	}

	/* Each iteration tries the current path prefix, then drops its last segment. */
	while (true) {
		len_path = buf->buf_filled;
		/* Find the '/' starting the last segment (or stop at initial_len). */
		previous_segment = len_path;
		while (previous_segment > initial_len &&
		       buf->buf[--previous_segment] != '/');

		if (!trailing_asterisks) {/* only on first iteration */
			if (!strcmp(buf->buf + previous_segment, "/*"))
				trailing_asterisks = 1;
			else if (!strcmp(buf->buf + previous_segment, "/***"))
				trailing_asterisks = 3;
			else
				trailing_asterisks = -1;
		}

		/* The exact path, without and with a trailing slash. */
		TRY_WILDCARD(segments_dropped == 0 && !path_truncated, "");
		TRY_WILDCARD(segments_dropped == 0 && trailing_dash &&
			     !path_truncated, "/");
		/*
		 * "/*" stands for exactly one dropped segment. It is skipped when
		 * the dropped segment was itself "*", as the key would be identical
		 * to the exact path tried in the previous iteration.
		 */
		TRY_WILDCARD(segments_dropped == 1 && !path_truncated &&
			     trailing_asterisks != 1, "/*");
		/* "/**" is tried only once more than one segment has been dropped. */
		TRY_WILDCARD(segments_dropped > 1, "/**");
		/*
		 * "/***" is tried at every level, except when it would merely
		 * repeat the exact path that ended in "***".
		 */
		TRY_WILDCARD((segments_dropped != 1 || path_truncated ||
			      trailing_asterisks != 3), "/***");

		stringbuf_truncate(buf, previous_segment);

		/* Nothing was removed - no path segments are left. */
		if (previous_segment == len_path)
			return 0;

		segments_dropped++;
	}

#undef TRY_WILDCARD
}

/*
 * Try the domain [domain_begin, domain_end) and then its successively shorter
 * suffixes (leading labels removed one by one). Each candidate is appended to
 * `buf' (which already holds the protocol) behind the applicable wildcard
 * prefix ("", "*.", "**." or "***.") and lookup_url_path() is run on the
 * result.
 *
 * Only the last MAX_DOMAIN_CHARS characters and the last MAX_DOMAIN_LEN labels
 * of the domain are considered; if anything had to be cut off, the exact
 * ("") and "*." forms are not tried.
 *
 * A candidate is only tried if it contains a '.'; a domain or suffix made of
 * a single label therefore results in no lookup at all.
 *
 * Returns the first non-zero value returned by lookup_url_path(), -2 when
 * sb_string() or sb_bytes() reports failure, 0 otherwise.
 */
static int lookup_url_domain(const char *domain_begin, const char *domain_end,
			     const char *path_begin, const char *path_end,
			     struct stringbuf *buf, struct scriptbase *base,
			     int (*callback)(struct page*, void*), void *data)
{
	bool domain_truncated = false;
	const char *label_start;
	int labels_allowed_left = MAX_DOMAIN_LEN;
	int labels_dropped = 0;
	int initial_len = buf->buf_filled;
	int result;

/*
 * If `condition' holds: reset buf to its length on entry, append `wildcard'
 * followed by the current [domain_begin, domain_end) and run the path lookup.
 * Returns from the enclosing function with -2 if appending fails and with the
 * path lookup's result if that is non-zero.
 */
#define TRY_WILDCARD(condition, wildcard)				\
	if (condition) {						\
		stringbuf_truncate(buf, initial_len);			\
		if (sb_string(buf, wildcard) ||				\
		    sb_bytes(buf, domain_begin, domain_end - domain_begin)) \
			return -2;					\
									\
		result = lookup_url_path(path_begin, path_end,		\
					 buf, base, callback, data);	\
		if (result)						\
			return result;					\
	}

	/*
	 * Overlong domain: keep only its last MAX_DOMAIN_CHARS characters and
	 * then move forward to the start of a label (just past a '.').
	 */
	if (domain_end - domain_begin > MAX_DOMAIN_CHARS) {
		domain_truncated = true;
		domain_begin = domain_end - MAX_DOMAIN_CHARS;
		while (domain_begin[-1] != '.') {
			if (++domain_begin == domain_end)
				return 0;
		}
	}

	/*
	 * Walk backwards from the end counting dots; stop right after the dot
	 * that exhausts the label allowance, or at domain_begin.
	 */
	for (label_start = domain_end;
	     label_start > domain_begin;
	     label_start--) {
		if (label_start[-1] == '.' && !--labels_allowed_left)
			break;
	}
	/* Stopping before domain_begin means leading labels were cut off. */
	if (label_start != domain_begin)
		domain_truncated = true;
	else
		labels_allowed_left--;

	while (true) {
		/* The candidate tried in this iteration starts at this label. */
		domain_begin = label_start;

		/*
		 * Advance label_start past the next '.'; give up if the candidate
		 * contains none.
		 */
		while (*(label_start++) != '.') {
			if (label_start >= domain_end)
				return 0;
		}

		/* Exact domain, then wildcards chosen by the number of dropped labels. */
		TRY_WILDCARD(labels_dropped == 0 && !domain_truncated, "");
		TRY_WILDCARD(labels_dropped == 1 && !domain_truncated, "*.");
		TRY_WILDCARD(labels_dropped > 1, "**.");
		TRY_WILDCARD(true, "***.");

		labels_dropped++;
	}

#undef TRY_WILDCARD
}

/*
 * Start building the lookup key: append the protocol
 * [proto_begin, proto_end) to `buf' and hand everything else over to
 * lookup_url_domain().
 *
 * Returns -2 when sb_bytes() reports failure, otherwise whatever
 * lookup_url_domain() returns.
 */
static int lookup_url_proto(const char *proto_begin, const char *proto_end,
			    const char *domain_begin, const char *domain_end,
			    const char *path_begin, const char *path_end,
			    struct stringbuf *buf, struct scriptbase *base,
			    int (*callback)(struct page*, void*), void *data)
{
	if (sb_bytes(buf, proto_begin, proto_end - proto_begin) == 0)
		return lookup_url_domain(domain_begin, domain_end,
					 path_begin, path_end,
					 buf, base, callback, data);

	return -2;
}

/*
 * Find all page patterns in `base' that match `url' and call `callback' with
 * each matching page and `data'.
 *
 * The URL is split with the regex compiled by init_url_lookup_regex(), then
 * candidate keys are built from its protocol, domain and path and looked up.
 *
 * Returns:
 *   -3  the URL regex has not been initialized,
 *   -1  `url' does not match the URL regex,
 *   -2  appending to the key buffer failed,
 *    1  `callback' returned non-zero (lookup stopped early),
 *    0  all candidates were tried.
 */
int lookup_url(const char *url, struct scriptbase *base,
	       int (*callback)(struct page*, void*), void *data)
{
	regmatch_t reg_matched[URL_REGEX_NMATCH];
	struct stringbuf buf;
	const char *path_begin, *path_end;
	int retval;

	if (!url_regex_ready) {
		fprintf(stderr, "Regex not initialized in " __FILE__ "\n");
		return -3;
	}

	/* NOTE(review): looks like leftover debugging output - confirm it is wanted. */
	printf("matching: %s\n", url);

	if (regexec(&url_regex_comp, url,
		    URL_REGEX_NMATCH, reg_matched, 0) ||
	    reg_matched[DOMAIN_MATCH].rm_so == -1)
		return -1;

	if (reg_matched[PATH_MATCH].rm_so == -1) {
		/*
		 * The optional path group did not take part in the match. Check
		 * the offset itself instead of computing `url - 1' (forming an
		 * out-of-bounds pointer is undefined) and pass an empty,
		 * in-bounds range so that no arithmetic is done on NULL later.
		 */
		path_begin = url + reg_matched[DOMAIN_MATCH].rm_eo;
		path_end = path_begin;
	} else {
		path_begin = url + reg_matched[PATH_MATCH].rm_so;
		path_end = url + reg_matched[PATH_MATCH].rm_eo;
	}

	stringbuf_init(&buf);

	retval = lookup_url_proto(url + reg_matched[PROTOCOL_MATCH].rm_so,
				  url + reg_matched[PROTOCOL_MATCH].rm_eo,
				  url + reg_matched[DOMAIN_MATCH].rm_so,
				  url + reg_matched[DOMAIN_MATCH].rm_eo,
				  path_begin, path_end,
				  &buf, base, callback, data);

	stringbuf_destroy(&buf);

	return retval;
}