aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWojtek Kosior <koszko@koszko.org>2021-11-24 15:53:00 +0100
committerWojtek Kosior <koszko@koszko.org>2021-11-30 19:33:22 +0100
commit69e53743393664ed2db59bbe9bbeaf6f124754f3 (patch)
tree75a07a47899bb202204f2a9fa8850954fbb5e023
parent6106c789ee818fd18240fd3f99eead598406852f (diff)
downloadbrowser-extension-69e53743393664ed2db59bbe9bbeaf6f124754f3.tar.gz
browser-extension-69e53743393664ed2db59bbe9bbeaf6f124754f3.zip
implement more efficient querying of URL patterns
-rw-r--r--common/patterns.js47
-rw-r--r--common/patterns_query_tree.js293
2 files changed, 321 insertions, 19 deletions
diff --git a/common/patterns.js b/common/patterns.js
index 625be05..790ce1b 100644
--- a/common/patterns.js
+++ b/common/patterns.js
@@ -7,10 +7,12 @@
* Redistribution terms are gathered in the `copyright' file.
*/
-const MAX_URL_PATH_LEN = 12;
-const MAX_URL_PATH_CHARS = 255;
-const MAX_DOMAIN_LEN = 7;
-const MAX_DOMAIN_CHARS = 100;
+const MAX = {
+ URL_PATH_LEN: 12,
+ URL_PATH_CHARS: 255,
+ DOMAIN_LEN: 7,
+ DOMAIN_CHARS: 100
+};
const proto_regex = /^(\w+):\/\/(.*)$/;
@@ -25,11 +27,17 @@ const file_regex = new RegExp(`^(${path_re}).*`);
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);
-function deconstruct_url(url)
+function deconstruct_url(url, use_limits=true)
{
+ const max = MAX;
+ if (!use_limits) {
+ for (key in MAX)
+ max[key] = Infinity;
+ }
+
const proto_match = proto_regex.exec(url);
if (proto_match === null)
- return undefined;
+ throw `bad url '${url}'`;
const deco = {proto: proto_match[1]};
@@ -37,20 +45,21 @@ function deconstruct_url(url)
deco.path = file_regex.exec(proto_match[2])[1];
} else if (deco.proto === "ftp") {
[deco.domain, deco.path] = ftp_regex.exec(proto_match[2]).slice(2, 4);
- } else {
+ } else if (deco.proto === "http" || deco.proto === "https") {
const http_match = http_regex.exec(proto_match[2]);
if (!http_match)
return undefined;
[deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
+ } else {
+ throw `unsupported protocol in url '${url}'`;
}
- const leading_dash = deco.path[0] === "/";
deco.trailing_dash = deco.path[deco.path.length - 1] === "/";
if (deco.domain) {
- if (deco.domain.length > MAX_DOMAIN_CHARS) {
+ if (deco.domain.length > max.DOMAIN_CHARS) {
const idx = deco.domain.indexOf(".", deco.domain.length -
- MAX_DOMAIN_CHARS);
+ max.DOMAIN_CHARS);
if (idx === -1)
deco.domain = [];
else
@@ -59,7 +68,7 @@ function deconstruct_url(url)
deco.domain_truncated = true;
}
- if (deco.path.length > MAX_URL_PATH_CHARS) {
+ if (deco.path.length > max.URL_PATH_CHARS) {
deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
deco.path_truncated = true;
}
@@ -67,16 +76,14 @@ function deconstruct_url(url)
if (typeof deco.domain === "string") {
deco.domain = deco.domain.split(".");
- if (deco.domain.splice(0, deco.domain.length - MAX_DOMAIN_LEN).length
+ if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
> 0)
deco.domain_truncated = true;
}
deco.path = deco.path.split("/").filter(s => s !== "");
- if (deco.domain && deco.path.splice(MAX_URL_PATH_LEN).length > 0)
+ if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
deco.path_truncated = true;
- if (leading_dash || deco.path.length === 0)
- deco.path.unshift("");
return deco;
}
@@ -98,13 +105,14 @@ function* each_domain_pattern(deco)
function* each_path_pattern(deco)
{
- for (let slice = deco.path.length; slice > 0; slice--) {
- const path_part = deco.path.slice(0, slice).join("/");
+ for (let slice = deco.path.length; slice >= 0; slice--) {
+ const path_part = ["", ...deco.path.slice(0, slice)].join("/");
const path_wildcards = [];
if (slice === deco.path.length && !deco.path_truncated) {
- if (deco.trailing_dash)
+ if (deco.trailing_dash && path_part !== )
yield path_part + "/";
- yield path_part;
+ if (part_part !== "" || deco.proto !== "file")
+ yield path_part;
}
if (slice === deco.path.length - 1 && !deco.path_truncated &&
deco.path[slice] !== "*")
@@ -137,5 +145,6 @@ function* each_url_pattern(url)
/*
* EXPORTS_START
* EXPORT each_url_pattern
+ * EXPORT deconstruct_url
* EXPORTS_END
*/
diff --git a/common/patterns_query_tree.js b/common/patterns_query_tree.js
new file mode 100644
index 0000000..93f74c6
--- /dev/null
+++ b/common/patterns_query_tree.js
@@ -0,0 +1,293 @@
+/**
+ * This file is part of Haketilo.
+ *
+ * Function: Data structure to query items by URL patterns.
+ *
+ * Copyright (C) 2021 Wojtek Kosior
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * As additional permission under GNU GPL version 3 section 7, you
+ * may distribute forms of that code without the copy of the GNU
+ * GPL normally required by section 4, provided you include this
+ * license notice and, in case of non-source distribution, a URL
+ * through which recipients can access the Corresponding Source.
+ * If you modify file(s) with this exception, you may extend this
+ * exception to your version of the file(s), but you are not
+ * obligated to do so. If you do not wish to do so, delete this
+ * exception statement from your version.
+ *
+ * As a special exception to the GPL, any HTML file which merely
+ * makes function calls to this code, and for that purpose
+ * includes it by reference shall be deemed a separate work for
+ * copyright law purposes. If you modify this code, you may extend
+ * this exception to your version of the code, but you are not
+ * obligated to do so. If you do not wish to do so, delete this
+ * exception statement from your version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
+ * license. Although I request that you do not make use this code in a
+ * proprietary program, I am not going to enforce this in court.
+ */
+
+/*
+ * IMPORTS_START
+ * IMPORT deconstruct_url
+ * IMPORTS_END
+ */
+
+/*
+ * This is a javascript rewrite of Python implementation of the algorithm here:
+ * https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205
+ */
+
+/* "Pattern Tree" is how we refer to the data structure used for querying
+ * Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
+ * is to make it possible for given URL to quickly retrieve all known patterns
+ * that match it.
+ */
+function make_tree_node() {
+ return {
+ wildcard_matches: [null, null, null],
+ literal_match: null,
+ children: null
+ };
+}
+
+const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0;
+
+/*
+ * Yields all matches of this segments sequence against the tree that starts at
+ * this node. Results are produces in order from greatest to lowest pattern
+ * specificity.
+ */
+function* search_sequence(tree_node, segments)
+{
+ const nodes = [tree_node];
+
+ for (const segment of segments) {
+ const next_node = nodes[nodes.length - 1].children[segment];
+ if (next_node === undefined)
+ break;
+
+ nodes.push(next_node);
+ }
+
+ const nsegments = segments.length;
+
+ const conds = [
+ /* literal pattern match */
+ () => nodes.length == nsegments,
+ /* wildcard pattern matches */
+ () => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
+ () => nodes.length + 1 < nsegments,
+ () => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
+ ];
+
+ while (nodes.length) {
+ const node = nodes.pop();
+ const items = [node.literal_match, ...node.wildcard_matches];
+
+ for (let i = 0; i < 4; i++) {
+ if (items[i] !== null && conds[i]())
+ yield items[i];
+ }
+ }
+}
+
+/*
+ * Make item queryable through (this branch of) the Pattern Tree. If there was
+ * not yet any item associated with the tree path designated by segments, create
+ * a new one using item_instantiator() function. Return all items matching this
+ * path (both the ones that existed and the ones just created).
+ */
+function add_sequence(tree_node, segments, item_instantiator)
+{
+ let wildcards;
+
+ for (var segment of segments) {
+ wildcards = tree_node.wildcard_matches;
+
+ const child = tree_node.children[segment] || make_tree_node();
+ tree_node.children[segment] = child;
+ tree_node = child;
+ }
+
+ if (tree_node.literal_match === null)
+ tree_node.literal_match = item_instantiator();
+
+ if (!is_wildcard(segment))
+ return [tree_node.literal_match];
+
+ if (wildcards[segment.length - 1] === null)
+ wildcards[segment.length - 1] = item_instantiator();
+
+ return [tree_node.literal_match, wildcards[segment.length - 1]];
+}
+
+/*
+ * Can be used to remove item from (this branch of) the Pattern Tree.
+ * item_modifier should be a function that accepts 1 argument, the item stored
+ * in the tree, and returns an item that should be used in place of the first
+ * one. It is also legal for it to return the same item modifying it first. If
+ * it returns `undefined`, it means the item should be deleted from the Tree.
+ *
+ * item_modifier() will be called with all items in (this branch of) the Pattern
+ * Tree that correspond to the path designated by segments. The Tree will be
+ * updated accordingly.
+ */
+function modify_sequence(tree_node, segments, item_modifier)
+{
+ const nodes = [tree_node];
+
+ for (const segment of segments) {
+ const next_node = nodes[nodes.length - 1].children[segment];
+ nodes.push(next_node);
+ }
+
+ const nsegments = segments.length;
+ let i = segments.length - 1;
+
+ if (nsegments > 0 && is_wildcard(segments[i])) {
+ const node = nodes[nodes.length - 2];
+ const asterisks = segments[i].length;
+ }
+
+ ............................
+
+ for (let i = segments.length - 1; i >= 0; i--) {
+
+ if (i
+ }
+
+ segments
+
+ const conds = [
+ /* literal pattern match */
+ () => nodes.length == nsegments,
+ /* wildcard pattern matches */
+ () => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
+ () => nodes.length + 1 < nsegments,
+ () => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
+ ];
+
+ while (nodes.length) {
+ const node = nodes.pop();
+ const items = [node.literal_match, ...node.wildcard_matches];
+
+ for (let i = 0; i < 4; i++) {
+ if (items[i] !== null && conds[i]())
+ yield items[i];
+ }
+ }
+ .................
+}
+
+/*
+ * Make item queryable through the Pattern Tree that starts with the protocols
+ * dictionary object passed in the first argument.
+ */
+function pattern_tree_register(patterns_by_proto, pattern, item_name, item)
+{
+ /*
+ * We pass 'false' to disable length limits on URL parts. Length limits are
+ * mostly useful in case of iteration over all patterns matching given URL.
+ * Here we don't do that.
+ */
+ const deco = deconstruct_url(pattern, false);
+
+ const tree = patterns_by_proto[deco.proto] || make_tree_node();
+ patterns_by_proto[deco.proto] = tree;
+
+ let path_trees;
+
+ if (deco.proto === "file") {
+ path_trees = [tree];
+ } else {
+ /* We need an aray of domain labels ordered most-significant-first. */
+ const domain = [...deco.domain].reverse();
+ path_trees = add_sequence(tree, domain, make_tree_node);
+ }
+
+ for (const path_tree of path_trees) {
+ for (const match_obj of add_sequence(path_tree, deco.path, () => {}))
+ match_obj[item_name] = item;
+ }
+}
+
+/*
+ * Remove registered item from the Pattern Tree that starts with the protocols
+ * dictionary object passed in the first argument. The remaining 2 arguments
+ * should be pattern and name that have been earlier passed to
+ * pattern_tree_register().
+ */
+function pattern_tree_deregister(patterns_by_proto, pattern, item_name)
+{
+ const deco = deconstruct_url(pattern, false);
+
+ const tree = patterns_by_proto[deco.proto] || make_tree_node();
+ patterns_by_proto[deco.proto] = tree;
+
+ let path_trees;
+
+ if (deco.proto === "file") {
+ path_trees = [tree];
+ } else {
+ /* We need an aray of domain labels ordered most-significant-first. */
+ const domain = [...deco.domain].reverse();
+ ..........................
+ path_trees = add_sequence(tree, domain, make_tree_node);
+ }
+}
+
+/*
+ * Yield registered items (as [item_name, item] arrays) that match url. For
+ * cosistent behavior, please don't modify the pattern tree while iterating over
+ * the results.
+ */
+function* pattern_tree_search(patterns_by_proto, url)
+{
+ const deco = deconstruct_url(url, false);
+
+ tree = patterns_by_proto[deco.proto] || make_tree_node();
+
+ let path_trees;
+
+ if (deco.proto === "file") {
+ path_trees = [tree];
+ } else {
+ /* We need an aray of domain labels ordered most-significant-first. */
+ const domain = [...deco.domain].reverse();
+ path_trees = search_sequence(tree, domain);
+ }
+
+ for (const path_tree of path_trees) {
+ for (const match_obj of search_sequence(path_tree, deco.path)) {
+ for (const entry of Object.entries(match_obj))
+ yield entry;
+ }
+ }
+}
+
+const patterns_tree = {
+ make: () => {},
+ register: patterns_tree_register,
+ search: patterns_tree_search
+}
+
+/*
+ * EXPORTS_START
+ * EXPORT patterns_tree
+ * EXPORTS_END
+ */