/** * part of Hydrilla * Routines for querying in-memory scriptbase, operating on data structures from * `scripbase.h'. * * Copyright (C) 2021 Wojtek Kosior * Redistribution terms are gathered in the `copyright' file. */ #include <stddef.h> #include <regex.h> #include <stdbool.h> #include <string.h> #include "hashtable.h" #include "string_buf.h" #include "scriptbase.h" #define MAX_URL_PATH_LEN 12 #define MAX_URL_PATH_CHARS 255 #define MAX_DOMAIN_LEN 7 #define MAX_DOMAIN_CHARS 100 const struct script *get_script(const char *name, struct scriptbase *base) { void *val; if (ht_get_threadsafe(&base->scripts, name, NULL, &val)) return NULL; return ((struct script*) val)->filled ? val : NULL; } const struct bag *get_bag(const char *name, struct scriptbase *base) { void *val; if (ht_get_threadsafe(&base->bags, name, NULL, &val)) return NULL; return ((struct bag*) val)->filled ? val : NULL; } const struct page *get_pattern(const char *pattern, struct scriptbase *base) { void *val = NULL; ht_get_threadsafe(&base->pages, pattern, NULL, &val); return val; } static const char url_regex[] = "^" "([a-zA-Z]{1,20}://)" /* protocol */ "([^/?#]{1,253})" /* domain */ "(/[^?#]*)?" /* path */ "\\\\?[^#]*" /* query */ "#?.*" /* target */ "$"; static regex_t url_regex_comp; static bool url_regex_ready; int init_url_lookup_regex(void) { int retval; retval = regcomp(&url_regex_comp, url_regex, REG_EXTENDED); url_regex_ready = !retval; return retval; } void destroy_url_lookup_regex(void) { if (!url_regex_ready) { fprintf(stderr, "Attempt to destroy uninitialized regex in " __FILE__ "\n"); return; } regfree(&url_regex_comp); url_regex_ready = false; } #define URL_REGEX_NMATCH 4 #define PROTOCOL_MATCH 1 #define DOMAIN_MATCH 2 #define PATH_MATCH 3 static int lookup_url_path(const char *path_begin, const char *path_end, struct stringbuf *buf, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { bool path_truncated = false; const char *segment_end = path_begin; int segments_allowed_left = MAX_URL_PATH_LEN; int segments_dropped = 0; int initial_len = buf->buf_filled; size_t len_path, previous_segment; void *val; bool trailing_dash = path_end != path_begin && path_end[-1] == '/'; int trailing_asterisks = 0; int result; if (path_end - path_begin > MAX_URL_PATH_CHARS) { path_truncated = true; path_end = path_begin + MAX_URL_PATH_CHARS; while (*path_end != '/') { if (--path_end == path_begin) break; } } while (true) { do { if (path_begin >= path_end) goto after_path_normalization; } while (*(path_begin++) == '/'); path_begin -= 2; if (!segments_allowed_left--) { path_truncated = true; break; } segment_end = path_begin + 1; while (*segment_end != '/' && ++segment_end < path_end); if (sb_bytes(buf, path_begin, segment_end - path_begin)) return -2; path_begin = segment_end; } after_path_normalization: #define TRY_WILDCARD(condition, wildcard) \ if (condition) { \ stringbuf_truncate(buf, len_path); \ if (sb_string(buf, wildcard)) \ return -2; \ \ result = ht_get_threadsafe(&base->pages, buf->buf, \ NULL, &val); \ if (!result && callback(val, data)) \ return 1; \ } while (true) { len_path = buf->buf_filled; previous_segment = len_path; while (previous_segment > initial_len && buf->buf[--previous_segment] != '/'); if (!trailing_asterisks) {/* only on first iteration */ if (!strcmp(buf->buf + previous_segment, "/*")) trailing_asterisks = 1; else if (!strcmp(buf->buf + previous_segment, "/***")) trailing_asterisks = 3; else trailing_asterisks = -1; } TRY_WILDCARD(segments_dropped == 0 && !path_truncated, ""); TRY_WILDCARD(segments_dropped == 0 && trailing_dash && !path_truncated, "/"); TRY_WILDCARD(segments_dropped == 1 && !path_truncated && trailing_asterisks != 1, "/*"); TRY_WILDCARD(segments_dropped > 1, "/**"); TRY_WILDCARD((segments_dropped != 1 || path_truncated || trailing_asterisks != 3), "/***"); stringbuf_truncate(buf, previous_segment); if (previous_segment == len_path) return 0; segments_dropped++; } #undef TRY_WILDCARD } static int lookup_url_domain(const char *domain_begin, const char *domain_end, const char *path_begin, const char *path_end, struct stringbuf *buf, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { bool domain_truncated = false; const char *label_start; int labels_allowed_left = MAX_DOMAIN_LEN; int labels_dropped = 0; int initial_len = buf->buf_filled; int result; #define TRY_WILDCARD(condition, wildcard) \ if (condition) { \ stringbuf_truncate(buf, initial_len); \ if (sb_string(buf, wildcard) || \ sb_bytes(buf, domain_begin, domain_end - domain_begin)) \ return -2; \ \ result = lookup_url_path(path_begin, path_end, \ buf, base, callback, data); \ if (result) \ return result; \ } if (domain_end - domain_begin > MAX_DOMAIN_CHARS) { domain_truncated = true; domain_begin = domain_end - MAX_DOMAIN_CHARS; while (domain_begin[-1] != '.') { if (++domain_begin == domain_end) return 0; } } for (label_start = domain_end; label_start > domain_begin; label_start--) { if (label_start[-1] == '.' && !--labels_allowed_left) break; } if (label_start != domain_begin) domain_truncated = true; else labels_allowed_left--; while (true) { domain_begin = label_start; while (*(label_start++) != '.') { if (label_start >= domain_end) return 0; } TRY_WILDCARD(labels_dropped == 0 && !domain_truncated, ""); TRY_WILDCARD(labels_dropped == 1 && !domain_truncated, "*."); TRY_WILDCARD(labels_dropped > 1, "**."); TRY_WILDCARD(true, "***."); labels_dropped++; } #undef TRY_WILDCARD } static int lookup_url_proto(const char *proto_begin, const char *proto_end, const char *domain_begin, const char *domain_end, const char *path_begin, const char *path_end, struct stringbuf *buf, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { if (sb_bytes(buf, proto_begin, proto_end - proto_begin)) return -2; return lookup_url_domain(domain_begin, domain_end, path_begin, path_end, buf, base, callback, data); } int lookup_url(const char *url, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { regmatch_t reg_matched[URL_REGEX_NMATCH]; struct stringbuf buf; const char *path_begin, *path_end; int retval; if (!url_regex_ready) { fprintf(stderr, "Regex not initialized in " __FILE__ "\n"); return -3; } printf("matching: %s\n", url); if (regexec(&url_regex_comp, url, URL_REGEX_NMATCH, reg_matched, 0) || reg_matched[DOMAIN_MATCH].rm_so == -1) return -1; stringbuf_init(&buf); path_begin = url + reg_matched[PATH_MATCH].rm_so; path_end = url + reg_matched[PATH_MATCH].rm_eo; if (path_begin == url - 1) { path_begin = NULL; path_end = NULL; } retval = lookup_url_proto(url + reg_matched[PROTOCOL_MATCH].rm_so, url + reg_matched[PROTOCOL_MATCH].rm_eo, url + reg_matched[DOMAIN_MATCH].rm_so, url + reg_matched[DOMAIN_MATCH].rm_eo, path_begin, path_end, &buf, base, callback, data); stringbuf_destroy(&buf); return retval; }