From e1282a63d6e41d437dd1b14a08baf89b78ab56cc Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Sat, 4 Dec 2021 19:31:43 +0100 Subject: finish implementing more efficient querying of URL patterns The algorithm is implemented and tested. However, it is yet to be hooked into the actual extension. --- common/patterns.js | 4 +- common/patterns_query_tree.js | 165 ++++++++++++++++++++++++------------------ 2 files changed, 97 insertions(+), 72 deletions(-) (limited to 'common') diff --git a/common/patterns.js b/common/patterns.js index 5d84a73..7d28dfe 100644 --- a/common/patterns.js +++ b/common/patterns.js @@ -96,7 +96,7 @@ function deconstruct_url(url, use_limits=true) throw `unsupported protocol in url '${url}'`; } - deco.trailing_dash = deco.path[deco.path.length - 1] === "/"; + deco.trailing_slash = deco.path[deco.path.length - 1] === "/"; if (deco.domain) { if (deco.domain.length > max.DOMAIN_CHARS) { @@ -151,7 +151,7 @@ function* each_path_pattern(deco) const path_part = ["", ...deco.path.slice(0, slice)].join("/"); const path_wildcards = []; if (slice === deco.path.length && !deco.path_truncated) { - if (deco.trailing_dash) + if (deco.trailing_slash) yield path_part + "/"; if (slice > 0 || deco.proto !== "file") yield path_part; diff --git a/common/patterns_query_tree.js b/common/patterns_query_tree.js index 326c7b7..49205c5 100644 --- a/common/patterns_query_tree.js +++ b/common/patterns_query_tree.js @@ -47,17 +47,12 @@ * IMPORTS_END */ -/* - * This is a javascript rewrite of Python implementation of the algorithm here: - * https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205 - */ - /* "Pattern Tree" is how we refer to the data structure used for querying * Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal * is to make it possible for given URL to quickly retrieve all known patterns * that match it. */ -function make_tree_node() { +function empty_node() { return { wildcard_matches: [null, null, null], literal_match: null, @@ -141,7 +136,7 @@ function modify_sequence(tree_node, segments, item_modifier) for (var current_segment of segments) { wildcards = tree_node.wildcard_matches; - const child = tree_node.children[current_segment] || make_tree_node(); + const child = tree_node.children[current_segment] || empty_node(); tree_node.children[current_segment] = child; tree_node = child; nodes.push(tree_node); @@ -171,11 +166,26 @@ function modify_sequence(tree_node, segments, item_modifier) } } -/* - * Make item queryable through the Pattern Tree that starts with the protocols - * dictionary object passed in the first argument. - */ -function pattern_tree_register(patterns_by_proto, pattern, item_name, item) +/* Helper function for modify_tree(). */ +function modify_path(tree_node, deco, item_modifier) +{ + tree_node = tree_node || empty_node(); + modify_sequence(tree_node, deco.path, item_modifier); + return is_empty_node(tree_node) ? null : tree_node; +} + +/* Helper function for modify_tree(). */ +function modify_domain(tree_node, deco, item_modifier) +{ + const path_modifier = branch => modify_path(branch, deco, item_modifier); + tree_node = tree_node || empty_node(); + /* We need an array of domain labels ordered most-significant-first. */ + modify_sequence(tree_node, [...deco.domain].reverse(), path_modifier); + return is_empty_node(tree_node) ? null : tree_node; +} + +/* Helper function for pattern_tree_register() and pattern_tree_deregister(). */ +function modify_tree(patterns_by_proto, pattern, item_modifier) { /* * We pass 'false' to disable length limits on URL parts. Length limits are @@ -184,84 +194,99 @@ function pattern_tree_register(patterns_by_proto, pattern, item_name, item) */ const deco = deconstruct_url(pattern, false); - const tree = patterns_by_proto[deco.proto] || make_tree_node(); - patterns_by_proto[deco.proto] = tree; + let tree_for_proto = patterns_by_proto[deco.proto]; - let path_trees; + tree_for_proto = deco.domain === undefined ? + modify_path(tree_for_proto, deco, item_modifier) : + modify_domain(tree_for_proto, deco, item_modifier); - if (deco.proto === "file") { - path_trees = [tree]; - } else { - /* We need an aray of domain labels ordered most-significant-first. */ - const domain = [...deco.domain].reverse(); - path_trees = add_sequence(tree, domain, make_tree_node); - } + patterns_by_proto[deco.proto] = tree_for_proto; + if (tree_for_proto === null) + delete patterns_by_proto[deco.proto]; +} - for (const path_tree of path_trees) { - for (const match_obj of add_sequence(path_tree, deco.path, () => {})) - match_obj[item_name] = item; - } +/* + * Make item queryable through the Pattern Tree that starts with the protocols + * dictionary object passed in the first argument. + */ +function pattern_tree_register(patterns_by_proto, pattern, item_name, item) +{ + const key_prefix = pattern[pattern.length - 1] === '/' ? '/' : '_'; + item_name = key_prefix + item_name; + const add_item = obj => Object.assign(obj || {}, {[item_name]: item}); + modify_tree(patterns_by_proto, pattern, add_item); +} + +/* Helper function for pattern_tree_deregister(). */ +function _remove_item(obj, item_name) +{ + obj = obj || {}; + delete obj[item_name]; + for (const key in obj) + return obj; + return null; } -// /* -// * Remove registered item from the Pattern Tree that starts with the protocols -// * dictionary object passed in the first argument. The remaining 2 arguments -// * should be pattern and name that have been earlier passed to -// * pattern_tree_register(). -// */ -// function pattern_tree_deregister(patterns_by_proto, pattern, item_name) -// { -// const deco = deconstruct_url(pattern, false); - -// const tree = patterns_by_proto[deco.proto] || make_tree_node(); -// patterns_by_proto[deco.proto] = tree; - -// let path_trees; - -// if (deco.proto === "file") { -// path_trees = [tree]; -// } else { -// /* We need an aray of domain labels ordered most-significant-first. */ -// const domain = [...deco.domain].reverse(); -// .......................... -// path_trees = add_sequence(tree, domain, make_tree_node); -// } -// } +/* + * Remove registered item from the Pattern Tree that starts with the protocols + * dictionary object passed in the first argument. The remaining 2 arguments + * should be pattern and name that have been earlier passed to + * pattern_tree_register(). + */ +function pattern_tree_deregister(patterns_by_proto, pattern, item_name) +{ + const key_prefix = pattern[pattern.length - 1] === '/' ? '/' : '_'; + item_name = key_prefix + item_name; + const remove_item = obj => _remove_item(obj, item_name); + modify_tree(patterns_by_proto, pattern, remove_item); +} /* - * Yield registered items (as [item_name, item] arrays) that match url. For - * cosistent behavior, please don't modify the pattern tree while iterating over - * the results. + * Yield registered items that match url. Each yielded value is an object with + * key being matched item names and values being the items. One such object + * shall contain all items matched with given pattern specificity. Objects are + * yielded in order from greatest to lowest pattern specificity. */ function* pattern_tree_search(patterns_by_proto, url) { const deco = deconstruct_url(url, false); - tree = patterns_by_proto[deco.proto] || make_tree_node(); + const tree_for_proto = patterns_by_proto[deco.proto] || empty_node(); + let by_path = [tree_for_proto]; - let path_trees; - - if (deco.proto === "file") { - path_trees = [tree]; - } else { - /* We need an aray of domain labels ordered most-significant-first. */ - const domain = [...deco.domain].reverse(); - path_trees = search_sequence(tree, domain); - } + /* We need an array of domain labels ordered most-significant-first. */ + if (deco.domain !== undefined) + by_path = search_sequence(tree_for_proto, [...deco.domain].reverse()); - for (const path_tree of path_trees) { + for (const path_tree of by_path) { for (const match_obj of search_sequence(path_tree, deco.path)) { - for (const entry of Object.entries(match_obj)) - yield entry; + let result_obj_slash = null; + let result_obj_no_slash = null; + + for (const [key, item] of Object.entries(match_obj)) { + if (deco.trailing_slash && key[0] === '/') { + result_obj_slash = result_obj_slash || {}; + result_obj_slash[key.substring(1)] = item; + } else if (key[0] !== '/') { + result_obj_no_slash = result_obj_no_slash || {}; + result_obj_no_slash[key.substring(1)] = item; + } + } + + if (deco.trailing_slash && result_obj_slash) + yield result_obj_slash; + + if (result_obj_no_slash) + yield result_obj_no_slash; } } } const pattern_tree = { - make: () => {}, - register: pattern_tree_register, - // deregister: pattern_tree_deregister, - search: pattern_tree_search + make: () => ({}), + register: pattern_tree_register, + deregister: pattern_tree_deregister, + search: pattern_tree_search } /* -- cgit v1.2.3