From 5c583de820c0d5f666a830ca1e8205fe7d55e61e Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Wed, 1 Dec 2021 21:08:03 +0100 Subject: start implementing more efficient querying of URL patterns --- common/patterns.js | 29 ++-- common/patterns_query_tree.js | 271 ++++++++++++++++++++++++++++++++ test/unit/conftest.py | 2 +- test/unit/test_patterns.py | 62 ++++++++ test/unit/test_patterns_query_tree.py | 283 ++++++++++++++++++++++++++++++++++ 5 files changed, 635 insertions(+), 12 deletions(-) create mode 100644 common/patterns_query_tree.js create mode 100644 test/unit/test_patterns_query_tree.py diff --git a/common/patterns.js b/common/patterns.js index 635b128..054e610 100644 --- a/common/patterns.js +++ b/common/patterns.js @@ -17,16 +17,25 @@ const MAX = { const proto_regex = /^(\w+):\/\/(.*)$/; const user_re = "[^/?#@]+@" -const domain_re = "[.a-zA-Z0-9-]+"; +const domain_re = "[.*a-zA-Z0-9-]+"; const path_re = "[^?#]*"; const query_re = "\\??[^#]*"; const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`); -const file_regex = new RegExp(`^(${path_re}).*`); +const file_regex = new RegExp(`^(/${path_re}).*`); const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`); +function match_or_throw(regex, string, error_msg) +{ + const match = regex.exec(string); + if (match === null) + throw error_msg; + + return match; +} + function deconstruct_url(url, use_limits=true) { const max = MAX; @@ -35,21 +44,19 @@ function deconstruct_url(url, use_limits=true) max[key] = Infinity; } - const proto_match = proto_regex.exec(url); - if (proto_match === null) - throw `bad url '${url}'`; + const matcher = (re, str) => match_or_throw(re, str, `bad url '${url}'`) + const proto_match = matcher(proto_regex, url); const deco = {proto: proto_match[1]}; if (deco.proto === "file") { - deco.path = file_regex.exec(proto_match[2])[1]; + deco.path = matcher(file_regex, proto_match[2])[1]; } else if (deco.proto === "ftp") { - [deco.domain, deco.path] = ftp_regex.exec(proto_match[2]).slice(2, 4); + [deco.domain, deco.path] = + matcher(ftp_regex, proto_match[2]).slice(2, 4); } else if (deco.proto === "http" || deco.proto === "https") { - const http_match = http_regex.exec(proto_match[2]); - if (!http_match) - return undefined; - [deco.domain, deco.path, deco.query] = http_match.slice(1, 4); + [deco.domain, deco.path, deco.query] = + matcher(http_regex, proto_match[2]).slice(1, 4); deco.domain = deco.domain.toLowerCase(); } else { throw `unsupported protocol in url '${url}'`; diff --git a/common/patterns_query_tree.js b/common/patterns_query_tree.js new file mode 100644 index 0000000..326c7b7 --- /dev/null +++ b/common/patterns_query_tree.js @@ -0,0 +1,271 @@ +/** + * This file is part of Haketilo. + * + * Function: Data structure to query items by URL patterns. + * + * Copyright (C) 2021 Wojtek Kosior + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * As additional permission under GNU GPL version 3 section 7, you + * may distribute forms of that code without the copy of the GNU + * GPL normally required by section 4, provided you include this + * license notice and, in case of non-source distribution, a URL + * through which recipients can access the Corresponding Source. + * If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not + * obligated to do so. If you do not wish to do so, delete this + * exception statement from your version. + * + * As a special exception to the GPL, any HTML file which merely + * makes function calls to this code, and for that purpose + * includes it by reference shall be deemed a separate work for + * copyright law purposes. If you modify this code, you may extend + * this exception to your version of the code, but you are not + * obligated to do so. If you do not wish to do so, delete this + * exception statement from your version. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * I, Wojtek Kosior, thereby promise not to sue for violation of this file's + * license. Although I request that you do not make use this code in a + * proprietary program, I am not going to enforce this in court. + */ + +/* + * IMPORTS_START + * IMPORT deconstruct_url + * IMPORTS_END + */ + +/* + * This is a javascript rewrite of Python implementation of the algorithm here: + * https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205 + */ + +/* "Pattern Tree" is how we refer to the data structure used for querying + * Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal + * is to make it possible for given URL to quickly retrieve all known patterns + * that match it. + */ +function make_tree_node() { + return { + wildcard_matches: [null, null, null], + literal_match: null, + children: {} + }; +} + +function is_empty_node(tree_node) { + const children = tree_node.children; + for (const key in children) { + if (children.hasOwnProperty(key)) + return false; + } + + if (Array.reduce(tree_node.wildcard_matches, (a, b) => b && a !== null, 1)) + return false; + + return tree_node.literal_match === null; +} + +const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0; + +/* + * Yields all matches of this segments sequence against the tree that starts at + * this node. Results are produces in order from greatest to lowest pattern + * specificity. + */ +function* search_sequence(tree_node, segments) +{ + const nodes = [tree_node]; + + for (const segment of segments) { + const next_node = nodes[nodes.length - 1].children[segment]; + if (next_node === undefined) + break; + + nodes.push(next_node); + } + + const nsegments = segments.length; + + const conds = [ + /* literal pattern match */ + () => nodes.length == nsegments, + /* wildcard pattern matches */ + () => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*", + () => nodes.length + 1 < nsegments, + () => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***" + ]; + + while (nodes.length) { + const node = nodes.pop(); + const items = [node.literal_match, ...node.wildcard_matches]; + + for (let i = 0; i < 4; i++) { + if (items[i] !== null && conds[i]()) + yield items[i]; + } + } +} + +/* + * Make item queryable through (this branch of) the Pattern Tree or remove its + * path from there. + * + * item_modifier should be a function that accepts 1 argument, the item stored + * in the tree (or `null` if there wasn't any item there), and returns an item + * that should be used in place of the first one. It is also legal for it to + * return the same item modifying it first. If it returns `null`, it means the + * item should be deleted from the Tree. + * + * If there was not yet any item associated with the tree path designated by + * segments and value returned by item_modifier is not `null`, make the value + * queryable by this path. + */ +function modify_sequence(tree_node, segments, item_modifier) +{ + const nodes = [tree_node]; + let removed = true; + + for (var current_segment of segments) { + wildcards = tree_node.wildcard_matches; + + const child = tree_node.children[current_segment] || make_tree_node(); + tree_node.children[current_segment] = child; + tree_node = child; + nodes.push(tree_node); + } + + tree_node.literal_match = item_modifier(tree_node.literal_match); + if (tree_node.literal_match !== null) + removed = false; + + let i = segments.length; + + if (is_wildcard(current_segment)) { + const asterisks = current_segment.length - 1; + const wildcards = nodes[i - 1].wildcard_matches; + wildcards[asterisks] = item_modifier(wildcards[asterisks]); + if (wildcards[asterisks] !== null) + removed = false; + } + + while (!removed) + return; + + while (i > 0) { + tree_node = nodes[i--]; + if (is_empty_node(tree_node)) + delete nodes[i].children[segments[i]]; + } +} + +/* + * Make item queryable through the Pattern Tree that starts with the protocols + * dictionary object passed in the first argument. + */ +function pattern_tree_register(patterns_by_proto, pattern, item_name, item) +{ + /* + * We pass 'false' to disable length limits on URL parts. Length limits are + * mostly useful in case of iteration over all patterns matching given URL. + * Here we don't do that. + */ + const deco = deconstruct_url(pattern, false); + + const tree = patterns_by_proto[deco.proto] || make_tree_node(); + patterns_by_proto[deco.proto] = tree; + + let path_trees; + + if (deco.proto === "file") { + path_trees = [tree]; + } else { + /* We need an aray of domain labels ordered most-significant-first. */ + const domain = [...deco.domain].reverse(); + path_trees = add_sequence(tree, domain, make_tree_node); + } + + for (const path_tree of path_trees) { + for (const match_obj of add_sequence(path_tree, deco.path, () => {})) + match_obj[item_name] = item; + } +} + +// /* +// * Remove registered item from the Pattern Tree that starts with the protocols +// * dictionary object passed in the first argument. The remaining 2 arguments +// * should be pattern and name that have been earlier passed to +// * pattern_tree_register(). +// */ +// function pattern_tree_deregister(patterns_by_proto, pattern, item_name) +// { +// const deco = deconstruct_url(pattern, false); + +// const tree = patterns_by_proto[deco.proto] || make_tree_node(); +// patterns_by_proto[deco.proto] = tree; + +// let path_trees; + +// if (deco.proto === "file") { +// path_trees = [tree]; +// } else { +// /* We need an aray of domain labels ordered most-significant-first. */ +// const domain = [...deco.domain].reverse(); +// .......................... +// path_trees = add_sequence(tree, domain, make_tree_node); +// } +// } + +/* + * Yield registered items (as [item_name, item] arrays) that match url. For + * cosistent behavior, please don't modify the pattern tree while iterating over + * the results. + */ +function* pattern_tree_search(patterns_by_proto, url) +{ + const deco = deconstruct_url(url, false); + + tree = patterns_by_proto[deco.proto] || make_tree_node(); + + let path_trees; + + if (deco.proto === "file") { + path_trees = [tree]; + } else { + /* We need an aray of domain labels ordered most-significant-first. */ + const domain = [...deco.domain].reverse(); + path_trees = search_sequence(tree, domain); + } + + for (const path_tree of path_trees) { + for (const match_obj of search_sequence(path_tree, deco.path)) { + for (const entry of Object.entries(match_obj)) + yield entry; + } + } +} + +const pattern_tree = { + make: () => {}, + register: pattern_tree_register, + // deregister: pattern_tree_deregister, + search: pattern_tree_search +} + +/* + * EXPORTS_START + * EXPORT pattern_tree + * EXPORTS_END + */ diff --git a/test/unit/conftest.py b/test/unit/conftest.py index 6877b7a..62cc1a0 100644 --- a/test/unit/conftest.py +++ b/test/unit/conftest.py @@ -76,7 +76,7 @@ try { return window.haketilo_selenium_return_value; ''' -def _execute_in_page_context(driver, script, *args): +def _execute_in_page_context(driver, script, args): script = script + '\n;\nwindow.haketilo_selenium_exception = false;' try: return driver.execute_script(script_injecting_script, script, args) diff --git a/test/unit/test_patterns.py b/test/unit/test_patterns.py index 4162fc0..4cfc10c 100644 --- a/test/unit/test_patterns.py +++ b/test/unit/test_patterns.py @@ -89,3 +89,65 @@ def test_regexes(execute_in_page, patterns_code): match = execute_in_page('returnval(ftp_regex.exec(arguments[0]));', '@bad.url/') assert match is None + +def test_deconstruct_url(execute_in_page, patterns_code): + """ + patterns.js contains deconstruct_url() function that handles URL parsing. + Verify it works properly. + """ + execute_in_page(patterns_code, page='https://gotmyowndoma.in') + + deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', + 'https://eXaMpLe.com/a/b?ver=1.2.3#heading2') + assert deco + assert deco['trailing_dash'] == False + assert deco['proto'] == 'https' + assert deco['domain'] == ['example', 'com'] + assert deco['path'] == ['a', 'b'] + + deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', + 'http://**.example.com/') + assert deco + assert deco['trailing_dash'] == True + assert deco['proto'] == 'http' + assert deco['domain'] == ['**', 'example', 'com'] + assert deco['path'] == [] + + deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', + 'ftp://user@ftp.example.com/all///passwords.txt/') + assert deco + assert deco['trailing_dash'] == True + assert deco['proto'] == 'ftp' + assert deco['domain'] == ['ftp', 'example', 'com'] + assert deco['path'] == ['all', 'passwords.txt'] + + deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', + 'ftp://mirror.edu.pl.eu.org') + assert deco + assert deco['trailing_dash'] == False + assert deco['proto'] == 'ftp' + assert deco['domain'] == ['mirror', 'edu', 'pl', 'eu', 'org'] + assert deco['path'] == [] + + deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', + 'file:///mnt/parabola_chroot///etc/passwd') + assert deco + assert deco['trailing_dash'] == False + assert deco['proto'] == 'file' + assert deco['path'] == ['mnt', 'parabola_chroot', 'etc', 'passwd'] + + for bad_url in [ + '://bad-url.missing/protocol', + 'http:/example.com/a/b', + 'unknown://example.com/a/b', + 'idontfancypineapple', + 'ftp://@example.org/', + 'https:///some/path/', + 'file://non-absolute/path' + ]: + with pytest.raises(Exception, match=r'Error in injected script'): + deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', + bad_url) + + # at some point we might also consider testing url deconstruction with + # length limits... diff --git a/test/unit/test_patterns_query_tree.py b/test/unit/test_patterns_query_tree.py new file mode 100644 index 0000000..9fbc0c3 --- /dev/null +++ b/test/unit/test_patterns_query_tree.py @@ -0,0 +1,283 @@ +# SPDX-License-Identifier: CC0-1.0 + +""" +Haketilo unit tests - URL patterns +""" + +# This file is part of Haketilo +# +# Copyright (C) 2021, Wojtek Kosior +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the CC0 1.0 Universal License as published by +# the Creative Commons Corporation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# CC0 1.0 Universal License for more details. + +import pytest + +from ..script_loader import load_script + +@pytest.fixture(scope="session") +def patterns_tree_code(): + yield load_script('common/patterns_query_tree.js', ['common']) + +def test_modify_branch(execute_in_page, patterns_tree_code): + """ + patterns_query_tree.js contains Patterns Tree data structure that allows + arrays of string labels to be mapped to items. + Verify operations modifying a single branch of such tree work properly. + """ + execute_in_page(patterns_tree_code, page='https://gotmyowndoma.in') + execute_in_page( + ''' + let items_added; + let items_removed; + + function _item_adder(item, array) + { + items_added++; + return [...(array || []), item]; + } + + function item_adder(item) + { + items_added = 0; + return array => _item_adder(item, array); + } + + function _item_remover(array) + { + if (array !== null) { + items_removed++; + array.pop(); + } + return (array && array.length > 0) ? array : null; + } + + function item_remover() + { + items_removed = 0; + return _item_remover; + }''') + + # Let's construct some tree branch while checking that each addition gives + # the right result. + branch = execute_in_page( + '''{ + const branch = make_tree_node(); + modify_sequence(branch, ['com', 'example'], item_adder('some_item')); + returnval(branch); + }''') + assert branch == { + 'literal_match': None, + 'wildcard_matches': [None, None, None], + 'children': { + 'com': { + 'literal_match': None, + 'wildcard_matches': [None, None, None], + 'children': { + 'example': { + 'literal_match': ['some_item'], + 'wildcard_matches': [None, None, None], + 'children': { + } + } + } + } + } + } + + branch, items_added = execute_in_page( + '''{ + const branch = arguments[0]; + modify_sequence(branch, ['com', 'example'], item_adder('other_item')); + returnval([branch, items_added]); + }''', branch) + assert items_added == 1 + assert branch['children']['com']['children']['example']['literal_match'] \ + == ['some_item', 'other_item'] + + for i in range(3): + for expected_array in [['third_item'], ['third_item', '4th_item']]: + wildcard = '*' * (i + 1) + branch, items_added = execute_in_page( + '''{ + const branch = arguments[0]; + modify_sequence(branch, ['com', 'sample', arguments[1]], + item_adder(arguments[2])); + returnval([branch, items_added]); + }''', + branch, wildcard, expected_array[-1]) + assert items_added == 2 + sample = branch['children']['com']['children']['sample'] + assert sample['wildcard_matches'][i] == expected_array + assert sample['children'][wildcard]['literal_match'] \ + == expected_array + + branch, items_added = execute_in_page( + '''{ + const branch = arguments[0]; + modify_sequence(branch, ['org', 'koszko', '***', '123'], + item_adder('5th_item')); + returnval([branch, items_added]); + }''', + branch) + assert items_added == 1 + assert branch['children']['org']['children']['koszko']['children']['***']\ + ['children']['123']['literal_match'] == ['5th_item'] + + # Let's verify that removing a nonexistent element doesn't modify the tree. + branch2, items_removed = execute_in_page( + '''{ + const branch = arguments[0]; + modify_sequence(branch, ['com', 'not', 'registered', '*'], + item_remover()); + returnval([branch, items_removed]); + }''', + branch) + assert branch == branch2 + assert items_removed == 0 + + # Let's remove all elements in the tree branch while checking that each + # removal gives the right result. + branch, items_removed = execute_in_page( + '''{ + const branch = arguments[0]; + modify_sequence(branch, ['org', 'koszko', '***', '123'], + item_remover()); + returnval([branch, items_removed]); + }''', + branch) + assert items_removed == 1 + assert 'org' not in branch['children'] + + for i in range(3): + for expected_array in [['third_item'], None]: + wildcard = '*' * (i + 1) + branch, items_removed = execute_in_page( + '''{ + const branch = arguments[0]; + modify_sequence(branch, ['com', 'sample', arguments[1]], + item_remover()); + returnval([branch, items_removed]); + }''', + branch, wildcard) + assert items_removed == 2 + if i == 2 and expected_array == []: + break + sample = branch['children']['com']['children'].get('sample', {}) + assert sample.get('wildcard_matches', [None, None, None])[i] \ + == expected_array + assert sample.get('children', {}).get(wildcard, {})\ + .get('literal_match') == expected_array + + for i in range(2): + branch, items_removed = execute_in_page( + '''{ + const branch = arguments[0]; + modify_sequence(branch, ['com', 'example'], item_remover()); + returnval([branch, items_removed]); + }''', + branch) + assert items_removed == 1 + if i == 0: + assert branch['children']['com']['children']['example']\ + ['literal_match'] == ['some_item'] + else: + assert branch == { + 'literal_match': None, + 'wildcard_matches': [None, None, None], + 'children': { + } + } + +def test_search_branch(execute_in_page, patterns_tree_code): + """ + patterns_query_tree.js contains Patterns Tree data structure that allows + arrays of string labels to be mapped to items. + Verify searching a single branch of such tree work properly. + """ + execute_in_page(patterns_tree_code, page='https://gotmyowndoma.in') + execute_in_page( + ''' + const item_adder = item => (array => [...(array || []), item]); + ''') + + # Let's construct some tree branch to test on. + execute_in_page( + ''' + var branch = make_tree_node(); + + for (const [item, sequence] of [ + ['(root)', []], + ['***', ['***']], + ['**', ['**']], + ['*', ['*']], + + ['a', ['a']], + ['A', ['a']], + ['b', ['b']], + + ['a/***', ['a', '***']], + ['A/***', ['a', '***']], + ['a/**', ['a', '**']], + ['A/**', ['a', '**']], + ['a/*', ['a', '*']], + ['A/*', ['a', '*']], + ['a/sth', ['a', 'sth']], + ['A/sth', ['a', 'sth']], + + ['b/***', ['b', '***']], + ['b/**', ['b', '**']], + ['b/*', ['b', '*']], + ['b/sth', ['b', 'sth']], + ]) + modify_sequence(branch, sequence, item_adder(item)); + ''') + + # Let's make the actual searches on our testing branch. + for sequence, expected in [ + ([], [{'(root)'}, {'***'}]), + (['a'], [{'a', 'A'}, {'a/***', 'A/***'}, {'*'}, {'***'}]), + (['b'], [{'b'}, {'b/***'}, {'*'}, {'***'}]), + (['c'], [ {'*'}, {'***'}]), + (['***'], [{'***'}, {'*'} ]), + (['**'], [{'**'}, {'*'}, {'***'}]), + (['**'], [{'**'}, {'*'}, {'***'}]), + (['*'], [{'*'}, {'***'}]), + + (['a', 'sth'], [{'a/sth', 'A/sth'}, {'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]), + (['b', 'sth'], [{'b/sth'}, {'b/*'}, {'b/***'}, {'**'}, {'***'}]), + (['a', 'hts'], [ {'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]), + (['b', 'hts'], [ {'b/*'}, {'b/***'}, {'**'}, {'***'}]), + (['a', '***'], [{'a/***', 'A/***'}, {'a/*', 'A/*'}, {'**'}, {'***'}]), + (['b', '***'], [{'b/***'}, {'b/*'}, {'**'}, {'***'}]), + (['a', '**'], [{'a/**', 'A/**'}, {'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]), + (['b', '**'], [{'b/**'}, {'b/*'}, {'b/***'}, {'**'}, {'***'}]), + (['a', '*'], [{'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]), + (['b', '*'], [{'b/*'}, {'b/***'}, {'**'}, {'***'}]), + + (['a', 'c', 'd'], [{'a/**', 'A/**'}, {'a/***', 'A/***'}, {'**'}, {'***'}]), + (['b', 'c', 'd'], [{'b/**'}, {'b/***'}, {'**'}, {'***'}]) + ]: + result = execute_in_page( + ''' + returnval([...search_sequence(branch, arguments[0])]); + ''', + sequence) + + try: + assert len(result) == len(expected) + + for expected_set, result_array in zip(expected, result): + assert len(expected_set) == len(result_array) + assert expected_set == set(result_array) + except Exception as e: + import sys + print('sequence:', sequence, '\nexpected:', expected, + '\nresult:', result, file=sys.stderr) + raise e from None -- cgit v1.2.3