aboutsummaryrefslogtreecommitdiff
path: root/scriptbase_query.c
diff options
context:
space:
mode:
Diffstat (limited to 'scriptbase_query.c')
-rw-r--r--scriptbase_query.c278
1 files changed, 278 insertions, 0 deletions
diff --git a/scriptbase_query.c b/scriptbase_query.c
new file mode 100644
index 0000000..fe9a910
--- /dev/null
+++ b/scriptbase_query.c
@@ -0,0 +1,278 @@
+/**
+ * part of Hydrilla
+ * Routines for querying in-memory scriptbase, operating on data structures from
+ * `scriptbase.h'.
+ *
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ */
+
+#include <stddef.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "hashtable.h"
+#include "string_buf.h"
+
+#include "scriptbase.h"
+
+/*
+ * Look up `name' in base->scripts.
+ *
+ * Returns the stored script, or NULL when ht_get_threadsafe() reports a
+ * nonzero status or when the entry found has its `filled' member unset.
+ */
+const struct script *get_script(const char *name, struct scriptbase *base)
+{
+	void *found;
+	const struct script *script;
+
+	if (ht_get_threadsafe(&base->scripts, name, NULL, &found) != 0)
+		return NULL;
+
+	script = found;
+	if (!script->filled)
+		return NULL;
+
+	return script;
+}
+
+/*
+ * Look up `name' in base->bags.
+ *
+ * Returns the stored bag only if the lookup succeeded (zero status from
+ * ht_get_threadsafe()) and the entry's `filled' member is set; otherwise NULL.
+ */
+const struct bag *get_bag(const char *name, struct scriptbase *base)
+{
+	void *found;
+	const struct bag *bag = NULL;
+
+	if (!ht_get_threadsafe(&base->bags, name, NULL, &found)) {
+		if (((struct bag*) found)->filled)
+			bag = found;
+	}
+
+	return bag;
+}
+
+/*
+ * Look up `pattern' in base->pages and return whatever the hashtable stored
+ * for it.
+ *
+ * The lookup status is intentionally not inspected: the result pointer starts
+ * out as NULL and is returned as-is, so a lookup that stores nothing yields
+ * NULL.
+ */
+const struct page *get_pattern(const char *pattern, struct scriptbase *base)
+{
+	void *found = NULL;
+
+	(void) ht_get_threadsafe(&base->pages, pattern, NULL, &found);
+
+	return found;
+}
+
+/*
+ * POSIX extended regex that splits a URL into its components. Capture groups
+ * 1-3 (protocol, domain, path) are the ones consumed by lookup_url(); query
+ * and target are matched only so that the whole URL is validated.
+ *
+ * Query and target are each a single optional group introduced by its own
+ * delimiter. This way leftover characters cannot be silently absorbed and
+ * the length limits on protocol and domain are actually enforced.
+ */
+static const char url_regex[] =
+	"^"
+	"([a-zA-Z]{1,20}://)"	/* protocol */
+	"([^/?#]{1,253})"	/* domain */
+	"(/[^?#]*)?"		/* path */
+	"(\\?[^#]*)?"		/* query */
+	"(#.*)?"		/* target */
+	"$";
+
+/* Compiled form of url_regex; meaningful only while url_regex_ready is true. */
+static regex_t url_regex_comp;
+static bool url_regex_ready;
+
+/*
+ * Compile url_regex into url_regex_comp and record whether that succeeded.
+ *
+ * Returns the regcomp() status: 0 on success, nonzero on failure.
+ */
+int init_url_lookup_regex(void)
+{
+	const int status = regcomp(&url_regex_comp, url_regex, REG_EXTENDED);
+
+	url_regex_ready = (status == 0);
+
+	return status;
+}
+
+/*
+ * Release the regex compiled by init_url_lookup_regex().
+ *
+ * Calling this without a successfully initialized regex only prints a
+ * diagnostic to stderr. After the regex is freed the ready flag is cleared,
+ * so a repeated destroy or a later lookup_url() call is rejected instead of
+ * operating on freed state.
+ */
+void destroy_url_lookup_regex(void)
+{
+	if (!url_regex_ready) {
+		fprintf(stderr, "Attempt to destroy uninitialized regex in " __FILE__ "\n");
+		return;
+	}
+
+	regfree(&url_regex_comp);
+	url_regex_ready = false;
+}
+
+#define URL_REGEX_NMATCH 4 /* size of the regmatch_t array handed to regexec(): whole match plus the 3 groups below */
+
+#define PROTOCOL_MATCH 1 /* regmatch_t index of the protocol group of url_regex */
+#define DOMAIN_MATCH 2 /* regmatch_t index of the domain group of url_regex */
+#define PATH_MATCH 3 /* regmatch_t index of the (optional) path group of url_regex */
+
+/*
+ * Query base->pages with the URL's path and with wildcard patterns derived
+ * from it, passing every page found to `callback' together with `data'.
+ *
+ * On entry `buf' holds the part of the pattern that precedes the path; it is
+ * extended and truncated in place while searching. [path_begin, path_end)
+ * is the path part of the URL and may be empty (lookup_url() passes two NULLs
+ * when the URL has no path).
+ *
+ * The path is first normalized: runs of slashes are collapsed and trailing
+ * slashes are dropped (their presence is remembered). Then patterns are
+ * tried in this order:
+ *   - the normalized path itself, and the same with a slash appended if the
+ *     original path ended with one;
+ *   - with one trailing segment dropped: the one-asterisk and three-asterisk
+ *     wildcard segments appended to the remaining prefix;
+ *   - with more segments dropped: the two-asterisk and three-asterisk
+ *     wildcard segments appended.
+ * A wildcard is skipped when appending it would just recreate the exact path
+ * that was already tried (last segment consisting solely of that many
+ * asterisks).
+ *
+ * Returns 1 as soon as `callback' returns nonzero, -2 if appending to `buf'
+ * fails and 0 after all patterns have been tried.
+ */
+static int lookup_url_path(const char *path_begin, const char *path_end,
+			   struct stringbuf *buf, struct scriptbase *base,
+			   int (*callback)(struct page*, void*), void *data)
+{
+	const char *segment_end = path_begin;
+	int segments_dropped = 0;
+	int initial_len = buf->buf_filled;
+	size_t len_path, previous_segment;
+	void *val;
+	bool trailing_dash = path_end != path_begin && path_end[-1] == '/';
+	char asterisks[] = "/***";
+	int trailing_asterisks = 0, i;
+	int result;
+
+	/* Append the path to buf segment by segment, one slash per segment. */
+	while (true) {
+		/* Skip a run of slashes; stop when the path is exhausted. */
+		do {
+			if (path_begin >= path_end)
+				goto after_path_normalization;
+		} while (*(path_begin++) == '/');
+		/* Step back onto the last slash preceding the segment. */
+		path_begin -= 2;
+
+		segment_end = path_begin + 1;
+		while (*segment_end != '/' && ++segment_end < path_end);
+
+		if (sb_bytes(buf, path_begin, segment_end - path_begin))
+			return -2;
+
+		path_begin = segment_end;
+	}
+
+after_path_normalization:
+#define TRY_WILDCARD(condition, wildcard)				\
+	if (condition) {						\
+		stringbuf_truncate(buf, len_path);			\
+		if (sb_string(buf, wildcard))				\
+			return -2;					\
+									\
+		result = ht_get_threadsafe(&base->pages, buf->buf,	\
+					   NULL, &val);			\
+		if (!result && callback(val, data))			\
+			return 1;					\
+	}
+
+	while (true) {
+		len_path = buf->buf_filled;
+		/* Find the slash that starts the last remaining segment. */
+		previous_segment = len_path;
+		while (previous_segment > initial_len &&
+		       buf->buf[--previous_segment] != '/');
+
+		if (!trailing_asterisks) {/* only on first iteration */
+			trailing_asterisks = -1;
+
+			for (i = 3; i > 0; i--) {
+				asterisks[i + 1] = '\0';
+
+				/*
+				 * The last segment must consist of exactly
+				 * i asterisks. A prefix match alone would
+				 * misclassify segments that merely start
+				 * with asterisks and suppress a wildcard
+				 * that does not coincide with the exact
+				 * path. Checking the length first also
+				 * keeps strncmp() inside the filled part of
+				 * the buffer.
+				 */
+				if (len_path - previous_segment !=
+				    (size_t) i + 1 ||
+				    strncmp(buf->buf + previous_segment,
+					    asterisks, i + 1))
+					continue;
+
+				trailing_asterisks = i;
+				break;
+			}
+		}
+
+		TRY_WILDCARD(segments_dropped == 0, "");
+		TRY_WILDCARD(segments_dropped == 0 && trailing_dash, "/");
+		TRY_WILDCARD(segments_dropped == 1 && trailing_asterisks != 1,
+			     "/*");
+		TRY_WILDCARD(segments_dropped > 1, "/**");
+		TRY_WILDCARD(segments_dropped > 0 &&
+			     (segments_dropped > 1 || trailing_asterisks != 3),
+			     "/***");
+
+		/* Drop the last segment for the next round. */
+		stringbuf_truncate(buf, previous_segment);
+
+		/* Nothing was left to drop: every pattern has been tried. */
+		if (previous_segment == len_path)
+			return 0;
+
+		/*
+		 * We only ever care if this count is 0, 1 or > 1,
+		 * hence size_t is not necessary.
+		 */
+		if (segments_dropped < 2)
+			segments_dropped++;
+	}
+
+#undef TRY_WILDCARD
+}
+
+/*
+ * Try the URL's domain and wildcard patterns derived from it, running
+ * lookup_url_path() for each candidate.
+ *
+ * On entry `buf' holds the part of the pattern preceding the domain (the
+ * protocol). For each candidate, buf is truncated back to that length, then
+ * the wildcard prefix and the remaining domain labels are appended.
+ *
+ * Each round drops one more leading label. With nothing dropped, the domain
+ * itself and the domain prefixed with the three-asterisk wildcard label are
+ * tried. With one label dropped, the one-, two- and three-asterisk wildcard
+ * labels are tried. With more dropped, the two- and three-asterisk ones.
+ * The search stops once the remaining part contains no dot, so a dot-less
+ * remainder is never looked up.
+ *
+ * Returns -2 if appending to `buf' fails, any nonzero result of
+ * lookup_url_path() as-is, and 0 when the candidates are exhausted.
+ */
+static int lookup_url_domain(const char *domain_begin, const char *domain_end,
+			     const char *path_begin, const char *path_end,
+			     struct stringbuf *buf, struct scriptbase *base,
+			     int (*callback)(struct page*, void*), void *data)
+{
+	const char *next_label = domain_begin;
+	int labels_dropped = 0;
+	int initial_len = buf->buf_filled;
+	int result;
+
+#define TRY_WILDCARD(condition, wildcard)				\
+	if (condition) {						\
+		stringbuf_truncate(buf, initial_len);			\
+		if (sb_string(buf, wildcard) ||				\
+		    sb_bytes(buf, domain_begin, domain_end - domain_begin)) \
+			return -2;					\
+									\
+		result = lookup_url_path(path_begin, path_end,		\
+					 buf, base, callback, data);	\
+		if (result)						\
+			return result;					\
+	}
+
+	while (true) {
+		domain_begin = next_label;
+
+		/*
+		 * Advance past the next dot. The bound is checked before
+		 * every read so nothing at or beyond domain_end is ever
+		 * dereferenced.
+		 */
+		do {
+			if (next_label >= domain_end)
+				return 0;
+		} while (*(next_label++) != '.');
+
+		TRY_WILDCARD(labels_dropped == 0, "");
+		TRY_WILDCARD(labels_dropped == 1, "*.");
+		TRY_WILDCARD(labels_dropped > 0, "**.");
+		TRY_WILDCARD(true, "***.");
+
+		labels_dropped++;
+	}
+
+#undef TRY_WILDCARD
+}
+
+/*
+ * Append the protocol part [proto_begin, proto_end) to `buf' and continue the
+ * lookup with the domain and path.
+ *
+ * Returns -2 if the append fails, otherwise the result of
+ * lookup_url_domain().
+ */
+static int lookup_url_proto(const char *proto_begin, const char *proto_end,
+			    const char *domain_begin, const char *domain_end,
+			    const char *path_begin, const char *path_end,
+			    struct stringbuf *buf, struct scriptbase *base,
+			    int (*callback)(struct page*, void*), void *data)
+{
+	int status = -2;
+
+	if (!sb_bytes(buf, proto_begin, proto_end - proto_begin))
+		status = lookup_url_domain(domain_begin, domain_end,
+					   path_begin, path_end,
+					   buf, base, callback, data);
+
+	return status;
+}
+
+/*
+ * Find pages in `base' whose patterns match `url' and pass each of them to
+ * `callback' together with `data'.
+ *
+ * The URL is split with the regex prepared by init_url_lookup_regex(); its
+ * protocol, domain and (optional) path are then handed to the wildcard
+ * lookup routines, which build candidate patterns in a temporary stringbuf.
+ *
+ * Returns:
+ *   -3  the regex has not been initialized,
+ *   -1  `url' does not match the URL regex,
+ *   otherwise the result of lookup_url_proto(): -2 if building a pattern
+ *   failed, 1 if `callback' returned nonzero, 0 if all candidates were tried.
+ */
+int lookup_url(const char *url, struct scriptbase *base,
+	       int (*callback)(struct page*, void*), void *data)
+{
+	regmatch_t reg_matched[URL_REGEX_NMATCH];
+	struct stringbuf buf;
+	const char *path_begin = NULL, *path_end = NULL;
+	int retval;
+
+	if (!url_regex_ready) {
+		fprintf(stderr, "Regex not initialized in " __FILE__ "\n");
+		return -3;
+	}
+
+	/* NOTE(review): looks like a leftover debug trace on stdout - confirm. */
+	printf("matching: %s\n", url);
+
+	if (regexec(&url_regex_comp, url,
+		    URL_REGEX_NMATCH, reg_matched, 0) ||
+	    reg_matched[DOMAIN_MATCH].rm_so == -1)
+		return -1;
+
+	/*
+	 * The path group is optional; regexec() reports an unmatched group
+	 * with offset -1. Test the offset itself rather than forming a
+	 * pointer one before `url', which is not valid pointer arithmetic.
+	 */
+	if (reg_matched[PATH_MATCH].rm_so != -1) {
+		path_begin = url + reg_matched[PATH_MATCH].rm_so;
+		path_end = url + reg_matched[PATH_MATCH].rm_eo;
+	}
+
+	stringbuf_init(&buf);
+
+	retval = lookup_url_proto(url + reg_matched[PROTOCOL_MATCH].rm_so,
+				  url + reg_matched[PROTOCOL_MATCH].rm_eo,
+				  url + reg_matched[DOMAIN_MATCH].rm_so,
+				  url + reg_matched[DOMAIN_MATCH].rm_eo,
+				  path_begin, path_end,
+				  &buf, base, callback, data);
+
+	stringbuf_destroy(&buf);
+
+	return retval;
+}