From 939c0c2e799734d46e3c3b784545f7c0c489c191 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Sat, 7 Aug 2021 16:58:11 +0200 Subject: migrate to Autotools --- src/scriptbase_query.c | 278 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 src/scriptbase_query.c (limited to 'src/scriptbase_query.c') diff --git a/src/scriptbase_query.c b/src/scriptbase_query.c new file mode 100644 index 0000000..fe9a910 --- /dev/null +++ b/src/scriptbase_query.c @@ -0,0 +1,278 @@ +/** + * part of Hydrilla + * Routines for querying in-memory scriptbase, operating on data structures from + * `scripbase.h'. + * + * Copyright (C) 2021 Wojtek Kosior + * Redistribution terms are gathered in the `copyright' file. + */ + +#include +#include +#include +#include + +#include "hashtable.h" +#include "string_buf.h" + +#include "scriptbase.h" + +const struct script *get_script(const char *name, struct scriptbase *base) +{ + void *val; + + if (ht_get_threadsafe(&base->scripts, name, NULL, &val)) + return NULL; + + return ((struct script*) val)->filled ? val : NULL; +} + +const struct bag *get_bag(const char *name, struct scriptbase *base) +{ + void *val; + + if (ht_get_threadsafe(&base->bags, name, NULL, &val)) + return NULL; + + return ((struct bag*) val)->filled ? val : NULL; +} + +const struct page *get_pattern(const char *pattern, struct scriptbase *base) +{ + void *val = NULL; + + ht_get_threadsafe(&base->pages, pattern, NULL, &val); + + return val; +} + +static const char url_regex[] = + "^" + "([a-zA-Z]{1,20}://)" /* protocol */ + "([^/?#]{1,253})" /* domain */ + "(/[^?#]*)?" /* path */ + "\\\\?[^#]*" /* query */ + "#?.*" /* target */ + "$"; + +static regex_t url_regex_comp; +static bool url_regex_ready; + +int init_url_lookup_regex(void) +{ + int retval; + + retval = regcomp(&url_regex_comp, url_regex, REG_EXTENDED); + + url_regex_ready = !retval; + + return retval; +} + +void destroy_url_lookup_regex(void) +{ + if (!url_regex_ready) { + fprintf(stderr, "Attempt to destroy uninitialized regex in " __FILE__ "\n"); + return; + } + + regfree(&url_regex_comp); +} + +#define URL_REGEX_NMATCH 4 + +#define PROTOCOL_MATCH 1 +#define DOMAIN_MATCH 2 +#define PATH_MATCH 3 + +static int lookup_url_path(const char *path_begin, const char *path_end, + struct stringbuf *buf, struct scriptbase *base, + int (*callback)(struct page*, void*), void *data) +{ + const char *segment_end = path_begin; + int segments_dropped = 0; + int initial_len = buf->buf_filled; + size_t len_path, previous_segment; + void *val; + bool trailing_dash = path_end != path_begin && path_end[-1] == '/'; + char asterisks[] = "/***"; + int trailing_asterisks = 0, i; + int result; + + while (true) { + do { + if (path_begin >= path_end) + goto after_path_normalization; + } while (*(path_begin++) == '/'); + path_begin -= 2; + + segment_end = path_begin + 1; + while (*segment_end != '/' && ++segment_end < path_end); + + if (sb_bytes(buf, path_begin, segment_end - path_begin)) + return -2; + + path_begin = segment_end; + } + +after_path_normalization: +#define TRY_WILDCARD(condition, wildcard) \ + if (condition) { \ + stringbuf_truncate(buf, len_path); \ + if (sb_string(buf, wildcard)) \ + return -2; \ + \ + result = ht_get_threadsafe(&base->pages, buf->buf, \ + NULL, &val); \ + if (!result && callback(val, data)) \ + return 1; \ + } + + while (true) { + len_path = buf->buf_filled; + previous_segment = len_path; + while (previous_segment > initial_len && + buf->buf[--previous_segment] != '/'); + + if (!trailing_asterisks) {/* only on first iteration */ + trailing_asterisks = -1; + + for (i = 3; i > 0; i--) { + asterisks[i + 1] = '\0'; + + if (strncmp(buf->buf + previous_segment, + asterisks, i + 1)) + continue; + + trailing_asterisks = i; + + if (i != 3) + break; + + if (buf->buf[previous_segment + i + 1] == '*') + trailing_asterisks = -1; + + break; + } + } + + TRY_WILDCARD(segments_dropped == 0, ""); + TRY_WILDCARD(segments_dropped == 0 && trailing_dash, "/"); + TRY_WILDCARD(segments_dropped == 1 && trailing_asterisks != 1, + "/*"); + TRY_WILDCARD(segments_dropped > 1, "/**"); + TRY_WILDCARD(segments_dropped > 0 && + (segments_dropped > 1 || trailing_asterisks != 3), + "/***"); + + stringbuf_truncate(buf, previous_segment); + + if (previous_segment == len_path) + return 0; + + /* + * We only ever care if this count is 0, 1 or > 1, + * hence size_t is not necessary. + */ + if (segments_dropped < 2) + segments_dropped++; + } + +#undef TRY_WILDCARD +} + +static int lookup_url_domain(const char *domain_begin, const char *domain_end, + const char *path_begin, const char *path_end, + struct stringbuf *buf, struct scriptbase *base, + int (*callback)(struct page*, void*), void *data) +{ + const char *next_label = domain_begin; + int labels_dropped = 0; + int initial_len = buf->buf_filled; + int result; + +#define TRY_WILDCARD(condition, wildcard) \ + if (condition) { \ + stringbuf_truncate(buf, initial_len); \ + if (sb_string(buf, wildcard) || \ + sb_bytes(buf, domain_begin, domain_end - domain_begin)) \ + return -2; \ + \ + result = lookup_url_path(path_begin, path_end, \ + buf, base, callback, data); \ + if (result) \ + return result; \ + } + + while (true) { + domain_begin = next_label; + + while (*(next_label++) != '.') { + if (next_label >= domain_end) + return 0; + } + + TRY_WILDCARD(labels_dropped == 0, ""); + TRY_WILDCARD(labels_dropped == 1, "*."); + TRY_WILDCARD(labels_dropped > 0, "**."); + TRY_WILDCARD(true, "***."); + + labels_dropped++; + } + +#undef TRY_WILDCARD +} + +static int lookup_url_proto(const char *proto_begin, const char *proto_end, + const char *domain_begin, const char *domain_end, + const char *path_begin, const char *path_end, + struct stringbuf *buf, struct scriptbase *base, + int (*callback)(struct page*, void*), void *data) +{ + if (sb_bytes(buf, proto_begin, proto_end - proto_begin)) + return -2; + + return lookup_url_domain(domain_begin, domain_end, path_begin, path_end, + buf, base, callback, data); +} + +int lookup_url(const char *url, struct scriptbase *base, + int (*callback)(struct page*, void*), void *data) +{ + regmatch_t reg_matched[URL_REGEX_NMATCH]; + struct stringbuf buf; + const char *path_begin, *path_end; + int retval; + + if (!url_regex_ready) { + fprintf(stderr, "Regex not initialized in " __FILE__ "\n"); + return -3; + } + + printf("matching: %s\n", url); + + if (regexec(&url_regex_comp, url, + URL_REGEX_NMATCH, reg_matched, 0) || + reg_matched[DOMAIN_MATCH].rm_so == -1) + return -1; + + stringbuf_init(&buf); + + path_begin = url + reg_matched[PATH_MATCH].rm_so; + path_end = url + reg_matched[PATH_MATCH].rm_eo; + if (path_begin == url - 1) { + path_begin = NULL; + path_end = NULL; + } + + retval = lookup_url_proto(url + reg_matched[PROTOCOL_MATCH].rm_so, + url + reg_matched[PROTOCOL_MATCH].rm_eo, + url + reg_matched[DOMAIN_MATCH].rm_so, + url + reg_matched[DOMAIN_MATCH].rm_eo, + path_begin, path_end, + &buf, base, callback, data); + + stringbuf_destroy(&buf); + + return retval; +} -- cgit v1.2.3