/** * part of Hydrilla * Routines for querying in-memory scriptbase, operating on data structures from * `scripbase.h'. * * Copyright (C) 2021 Wojtek Kosior * Redistribution terms are gathered in the `copyright' file. */ #include #include #include #include #include "hashtable.h" #include "string_buf.h" #include "scriptbase.h" const struct script *get_script(const char *name, struct scriptbase *base) { void *val; if (ht_get_threadsafe(&base->scripts, name, NULL, &val)) return NULL; return ((struct script*) val)->filled ? val : NULL; } const struct bag *get_bag(const char *name, struct scriptbase *base) { void *val; if (ht_get_threadsafe(&base->bags, name, NULL, &val)) return NULL; return ((struct bag*) val)->filled ? val : NULL; } const struct page *get_pattern(const char *pattern, struct scriptbase *base) { void *val = NULL; ht_get_threadsafe(&base->pages, pattern, NULL, &val); return val; } static const char url_regex[] = "^" "([a-zA-Z]{1,20}://)" /* protocol */ "([^/?#]{1,253})" /* domain */ "(/[^?#]*)?" /* path */ "\\\\?[^#]*" /* query */ "#?.*" /* target */ "$"; static regex_t url_regex_comp; static bool url_regex_ready; int init_url_lookup_regex(void) { int retval; retval = regcomp(&url_regex_comp, url_regex, REG_EXTENDED); url_regex_ready = !retval; return retval; } void destroy_url_lookup_regex(void) { if (!url_regex_ready) { fprintf(stderr, "Attempt to destroy uninitialized regex in " __FILE__ "\n"); return; } regfree(&url_regex_comp); } #define URL_REGEX_NMATCH 4 #define PROTOCOL_MATCH 1 #define DOMAIN_MATCH 2 #define PATH_MATCH 3 static int lookup_url_path(const char *path_begin, const char *path_end, struct stringbuf *buf, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { const char *segment_end = path_begin; int segments_dropped = 0; int initial_len = buf->buf_filled; size_t len_path, previous_segment; void *val; bool trailing_dash = path_end != path_begin && path_end[-1] == '/'; char asterisks[] = "/***"; int trailing_asterisks = 0, i; int result; while (true) { do { if (path_begin >= path_end) goto after_path_normalization; } while (*(path_begin++) == '/'); path_begin -= 2; segment_end = path_begin + 1; while (*segment_end != '/' && ++segment_end < path_end); if (sb_bytes(buf, path_begin, segment_end - path_begin)) return -2; path_begin = segment_end; } after_path_normalization: #define TRY_WILDCARD(condition, wildcard) \ if (condition) { \ stringbuf_truncate(buf, len_path); \ if (sb_string(buf, wildcard)) \ return -2; \ \ result = ht_get_threadsafe(&base->pages, buf->buf, \ NULL, &val); \ if (!result && callback(val, data)) \ return 1; \ } while (true) { len_path = buf->buf_filled; previous_segment = len_path; while (previous_segment > initial_len && buf->buf[--previous_segment] != '/'); if (!trailing_asterisks) {/* only on first iteration */ trailing_asterisks = -1; for (i = 3; i > 0; i--) { asterisks[i + 1] = '\0'; if (strncmp(buf->buf + previous_segment, asterisks, i + 1)) continue; trailing_asterisks = i; if (i != 3) break; if (buf->buf[previous_segment + i + 1] == '*') trailing_asterisks = -1; break; } } TRY_WILDCARD(segments_dropped == 0, ""); TRY_WILDCARD(segments_dropped == 0 && trailing_dash, "/"); TRY_WILDCARD(segments_dropped == 1 && trailing_asterisks != 1, "/*"); TRY_WILDCARD(segments_dropped > 1, "/**"); TRY_WILDCARD(segments_dropped > 0 && (segments_dropped > 1 || trailing_asterisks != 3), "/***"); stringbuf_truncate(buf, previous_segment); if (previous_segment == len_path) return 0; /* * We only ever care if this count is 0, 1 or > 1, * hence size_t is not necessary. */ if (segments_dropped < 2) segments_dropped++; } #undef TRY_WILDCARD } static int lookup_url_domain(const char *domain_begin, const char *domain_end, const char *path_begin, const char *path_end, struct stringbuf *buf, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { const char *next_label = domain_begin; int labels_dropped = 0; int initial_len = buf->buf_filled; int result; #define TRY_WILDCARD(condition, wildcard) \ if (condition) { \ stringbuf_truncate(buf, initial_len); \ if (sb_string(buf, wildcard) || \ sb_bytes(buf, domain_begin, domain_end - domain_begin)) \ return -2; \ \ result = lookup_url_path(path_begin, path_end, \ buf, base, callback, data); \ if (result) \ return result; \ } while (true) { domain_begin = next_label; while (*(next_label++) != '.') { if (next_label >= domain_end) return 0; } TRY_WILDCARD(labels_dropped == 0, ""); TRY_WILDCARD(labels_dropped == 1, "*."); TRY_WILDCARD(labels_dropped > 0, "**."); TRY_WILDCARD(true, "***."); labels_dropped++; } #undef TRY_WILDCARD } static int lookup_url_proto(const char *proto_begin, const char *proto_end, const char *domain_begin, const char *domain_end, const char *path_begin, const char *path_end, struct stringbuf *buf, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { if (sb_bytes(buf, proto_begin, proto_end - proto_begin)) return -2; return lookup_url_domain(domain_begin, domain_end, path_begin, path_end, buf, base, callback, data); } int lookup_url(const char *url, struct scriptbase *base, int (*callback)(struct page*, void*), void *data) { regmatch_t reg_matched[URL_REGEX_NMATCH]; struct stringbuf buf; const char *path_begin, *path_end; int retval; if (!url_regex_ready) { fprintf(stderr, "Regex not initialized in " __FILE__ "\n"); return -3; } printf("matching: %s\n", url); if (regexec(&url_regex_comp, url, URL_REGEX_NMATCH, reg_matched, 0) || reg_matched[DOMAIN_MATCH].rm_so == -1) return -1; stringbuf_init(&buf); path_begin = url + reg_matched[PATH_MATCH].rm_so; path_end = url + reg_matched[PATH_MATCH].rm_eo; if (path_begin == url - 1) { path_begin = NULL; path_end = NULL; } retval = lookup_url_proto(url + reg_matched[PROTOCOL_MATCH].rm_so, url + reg_matched[PROTOCOL_MATCH].rm_eo, url + reg_matched[DOMAIN_MATCH].rm_so, url + reg_matched[DOMAIN_MATCH].rm_eo, path_begin, path_end, &buf, base, callback, data); stringbuf_destroy(&buf); return retval; }