diff options
-rw-r--r-- | common/patterns.js | 4 | ||||
-rw-r--r-- | common/patterns_query_tree.js | 165 | ||||
-rwxr-xr-x | test/profiles.py | 39 | ||||
-rw-r--r-- | test/unit/conftest.py | 12 | ||||
-rw-r--r-- | test/unit/test_patterns.py | 39 | ||||
-rw-r--r-- | test/unit/test_patterns_query_tree.py | 200 |
6 files changed, 357 insertions, 102 deletions
diff --git a/common/patterns.js b/common/patterns.js index 5d84a73..7d28dfe 100644 --- a/common/patterns.js +++ b/common/patterns.js @@ -96,7 +96,7 @@ function deconstruct_url(url, use_limits=true) throw `unsupported protocol in url '${url}'`; } - deco.trailing_dash = deco.path[deco.path.length - 1] === "/"; + deco.trailing_slash = deco.path[deco.path.length - 1] === "/"; if (deco.domain) { if (deco.domain.length > max.DOMAIN_CHARS) { @@ -151,7 +151,7 @@ function* each_path_pattern(deco) const path_part = ["", ...deco.path.slice(0, slice)].join("/"); const path_wildcards = []; if (slice === deco.path.length && !deco.path_truncated) { - if (deco.trailing_dash) + if (deco.trailing_slash) yield path_part + "/"; if (slice > 0 || deco.proto !== "file") yield path_part; diff --git a/common/patterns_query_tree.js b/common/patterns_query_tree.js index 326c7b7..49205c5 100644 --- a/common/patterns_query_tree.js +++ b/common/patterns_query_tree.js @@ -47,17 +47,12 @@ * IMPORTS_END */ -/* - * This is a javascript rewrite of Python implementation of the algorithm here: - * https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205 - */ - /* "Pattern Tree" is how we refer to the data structure used for querying * Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal * is to make it possible for given URL to quickly retrieve all known patterns * that match it. */ -function make_tree_node() { +function empty_node() { return { wildcard_matches: [null, null, null], literal_match: null, @@ -141,7 +136,7 @@ function modify_sequence(tree_node, segments, item_modifier) for (var current_segment of segments) { wildcards = tree_node.wildcard_matches; - const child = tree_node.children[current_segment] || make_tree_node(); + const child = tree_node.children[current_segment] || empty_node(); tree_node.children[current_segment] = child; tree_node = child; nodes.push(tree_node); @@ -171,11 +166,26 @@ function modify_sequence(tree_node, segments, item_modifier) } } -/* - * Make item queryable through the Pattern Tree that starts with the protocols - * dictionary object passed in the first argument. - */ -function pattern_tree_register(patterns_by_proto, pattern, item_name, item) +/* Helper function for modify_tree(). */ +function modify_path(tree_node, deco, item_modifier) +{ + tree_node = tree_node || empty_node(); + modify_sequence(tree_node, deco.path, item_modifier); + return is_empty_node(tree_node) ? null : tree_node; +} + +/* Helper function for modify_tree(). */ +function modify_domain(tree_node, deco, item_modifier) +{ + const path_modifier = branch => modify_path(branch, deco, item_modifier); + tree_node = tree_node || empty_node(); + /* We need an array of domain labels ordered most-significant-first. */ + modify_sequence(tree_node, [...deco.domain].reverse(), path_modifier); + return is_empty_node(tree_node) ? null : tree_node; +} + +/* Helper function for pattern_tree_register() and pattern_tree_deregister(). */ +function modify_tree(patterns_by_proto, pattern, item_modifier) { /* * We pass 'false' to disable length limits on URL parts. Length limits are @@ -184,84 +194,99 @@ function pattern_tree_register(patterns_by_proto, pattern, item_name, item) */ const deco = deconstruct_url(pattern, false); - const tree = patterns_by_proto[deco.proto] || make_tree_node(); - patterns_by_proto[deco.proto] = tree; + let tree_for_proto = patterns_by_proto[deco.proto]; - let path_trees; + tree_for_proto = deco.domain === undefined ? + modify_path(tree_for_proto, deco, item_modifier) : + modify_domain(tree_for_proto, deco, item_modifier); - if (deco.proto === "file") { - path_trees = [tree]; - } else { - /* We need an aray of domain labels ordered most-significant-first. */ - const domain = [...deco.domain].reverse(); - path_trees = add_sequence(tree, domain, make_tree_node); - } + patterns_by_proto[deco.proto] = tree_for_proto; + if (tree_for_proto === null) + delete patterns_by_proto[deco.proto]; +} - for (const path_tree of path_trees) { - for (const match_obj of add_sequence(path_tree, deco.path, () => {})) - match_obj[item_name] = item; - } +/* + * Make item queryable through the Pattern Tree that starts with the protocols + * dictionary object passed in the first argument. + */ +function pattern_tree_register(patterns_by_proto, pattern, item_name, item) +{ + const key_prefix = pattern[pattern.length - 1] === '/' ? '/' : '_'; + item_name = key_prefix + item_name; + const add_item = obj => Object.assign(obj || {}, {[item_name]: item}); + modify_tree(patterns_by_proto, pattern, add_item); +} + +/* Helper function for pattern_tree_deregister(). */ +function _remove_item(obj, item_name) +{ + obj = obj || {}; + delete obj[item_name]; + for (const key in obj) + return obj; + return null; } -// /* -// * Remove registered item from the Pattern Tree that starts with the protocols -// * dictionary object passed in the first argument. The remaining 2 arguments -// * should be pattern and name that have been earlier passed to -// * pattern_tree_register(). -// */ -// function pattern_tree_deregister(patterns_by_proto, pattern, item_name) -// { -// const deco = deconstruct_url(pattern, false); - -// const tree = patterns_by_proto[deco.proto] || make_tree_node(); -// patterns_by_proto[deco.proto] = tree; - -// let path_trees; - -// if (deco.proto === "file") { -// path_trees = [tree]; -// } else { -// /* We need an aray of domain labels ordered most-significant-first. */ -// const domain = [...deco.domain].reverse(); -// .......................... -// path_trees = add_sequence(tree, domain, make_tree_node); -// } -// } +/* + * Remove registered item from the Pattern Tree that starts with the protocols + * dictionary object passed in the first argument. The remaining 2 arguments + * should be pattern and name that have been earlier passed to + * pattern_tree_register(). + */ +function pattern_tree_deregister(patterns_by_proto, pattern, item_name) +{ + const key_prefix = pattern[pattern.length - 1] === '/' ? '/' : '_'; + item_name = key_prefix + item_name; + const remove_item = obj => _remove_item(obj, item_name); + modify_tree(patterns_by_proto, pattern, remove_item); +} /* - * Yield registered items (as [item_name, item] arrays) that match url. For - * cosistent behavior, please don't modify the pattern tree while iterating over - * the results. + * Yield registered items that match url. Each yielded value is an object with + * key being matched item names and values being the items. One such object + * shall contain all items matched with given pattern specificity. Objects are + * yielded in order from greatest to lowest pattern specificity. */ function* pattern_tree_search(patterns_by_proto, url) { const deco = deconstruct_url(url, false); - tree = patterns_by_proto[deco.proto] || make_tree_node(); + const tree_for_proto = patterns_by_proto[deco.proto] || empty_node(); + let by_path = [tree_for_proto]; - let path_trees; - - if (deco.proto === "file") { - path_trees = [tree]; - } else { - /* We need an aray of domain labels ordered most-significant-first. */ - const domain = [...deco.domain].reverse(); - path_trees = search_sequence(tree, domain); - } + /* We need an array of domain labels ordered most-significant-first. */ + if (deco.domain !== undefined) + by_path = search_sequence(tree_for_proto, [...deco.domain].reverse()); - for (const path_tree of path_trees) { + for (const path_tree of by_path) { for (const match_obj of search_sequence(path_tree, deco.path)) { - for (const entry of Object.entries(match_obj)) - yield entry; + let result_obj_slash = null; + let result_obj_no_slash = null; + + for (const [key, item] of Object.entries(match_obj)) { + if (deco.trailing_slash && key[0] === '/') { + result_obj_slash = result_obj_slash || {}; + result_obj_slash[key.substring(1)] = item; + } else if (key[0] !== '/') { + result_obj_no_slash = result_obj_no_slash || {}; + result_obj_no_slash[key.substring(1)] = item; + } + } + + if (deco.trailing_slash && result_obj_slash) + yield result_obj_slash; + + if (result_obj_no_slash) + yield result_obj_no_slash; } } } const pattern_tree = { - make: () => {}, - register: pattern_tree_register, - // deregister: pattern_tree_deregister, - search: pattern_tree_search + make: () => ({}), + register: pattern_tree_register, + deregister: pattern_tree_deregister, + search: pattern_tree_search } /* diff --git a/test/profiles.py b/test/profiles.py index d6a4efc..1530aea 100755 --- a/test/profiles.py +++ b/test/profiles.py @@ -31,7 +31,28 @@ import time from .misc_constants import * +class HaketiloFirefox(webdriver.Firefox): + """ + This wrapper class around selenium.webdriver.Firefox adds a `loaded_scripts` + instance property that gets resetted to an empty array every time the + `get()` method is called. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.reset_loaded_scripts() + + def reset_loaded_scripts(self): + self.loaded_scripts = [] + + def get(self, *args, **kwargs): + self.reset_loaded_scripts() + super().get(*args, **kwargs) + def set_profile_proxy(profile, proxy_host, proxy_port): + """ + Create a Firefox profile that uses the specified HTTP proxy for all + protocols. + """ # proxy type 1 designates "manual" profile.set_preference('network.proxy.type', 1) profile.set_preference('network.proxy.no_proxies_on', '') @@ -49,6 +70,10 @@ def set_profile_console_logging(profile): def firefox_safe_mode(firefox_binary=default_firefox_binary, proxy_host=default_proxy_host, proxy_port=default_proxy_port): + """ + Initialize a Firefox instance controlled by selenium. The instance is + started in safe mode. + """ profile = webdriver.FirefoxProfile() set_profile_proxy(profile, proxy_host, proxy_port) set_profile_console_logging(profile) @@ -56,16 +81,22 @@ def firefox_safe_mode(firefox_binary=default_firefox_binary, options = Options() options.add_argument('--safe-mode') - return webdriver.Firefox(options=options, firefox_profile=profile, - firefox_binary=firefox_binary) + return HaketiloFirefox(options=options, firefox_profile=profile, + firefox_binary=firefox_binary) def firefox_with_profile(firefox_binary=default_firefox_binary, profile_dir=default_clean_profile_dir, proxy_host=default_proxy_host, proxy_port=default_proxy_port): + """ + Initialize a Firefox instance controlled by selenium. The instance is + started using an empty profile (either the default one or the one passed to + `configure` script). The empty profile is meant to make Firefox start with + globally-installed extensions disabled. + """ profile = webdriver.FirefoxProfile(profile_dir) set_profile_proxy(profile, proxy_host, proxy_port) set_profile_console_logging(profile) - return webdriver.Firefox(firefox_profile=profile, - firefox_binary=firefox_binary) + return HaketiloFirefox(firefox_profile=profile, + firefox_binary=firefox_binary) diff --git a/test/unit/conftest.py b/test/unit/conftest.py index 62cc1a0..1500006 100644 --- a/test/unit/conftest.py +++ b/test/unit/conftest.py @@ -78,13 +78,19 @@ return window.haketilo_selenium_return_value; def _execute_in_page_context(driver, script, args): script = script + '\n;\nwindow.haketilo_selenium_exception = false;' + driver.loaded_scripts.append(script) try: return driver.execute_script(script_injecting_script, script, args) except Exception as e: import sys - lines = enumerate(script.split('\n'), 1) - for err_info in [('Failing script\n',), *lines]: - print(*err_info, file=sys.stderr) + + print("Scripts loaded since driver's last get() method call:", + file=sys.stderr) + + for script in driver.loaded_scripts: + lines = enumerate(script.split('\n'), 1) + for err_info in [('===',), *lines]: + print(*err_info, file=sys.stderr) raise e from None diff --git a/test/unit/test_patterns.py b/test/unit/test_patterns.py index 4cfc10c..802bf4e 100644 --- a/test/unit/test_patterns.py +++ b/test/unit/test_patterns.py @@ -100,41 +100,42 @@ def test_deconstruct_url(execute_in_page, patterns_code): deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'https://eXaMpLe.com/a/b?ver=1.2.3#heading2') assert deco - assert deco['trailing_dash'] == False - assert deco['proto'] == 'https' - assert deco['domain'] == ['example', 'com'] - assert deco['path'] == ['a', 'b'] + assert deco['trailing_slash'] == False + assert deco['proto'] == 'https' + assert deco['domain'] == ['example', 'com'] + assert deco['path'] == ['a', 'b'] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'http://**.example.com/') assert deco - assert deco['trailing_dash'] == True - assert deco['proto'] == 'http' - assert deco['domain'] == ['**', 'example', 'com'] - assert deco['path'] == [] + assert deco['trailing_slash'] == True + assert deco['proto'] == 'http' + assert deco['domain'] == ['**', 'example', 'com'] + assert deco['path'] == [] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'ftp://user@ftp.example.com/all///passwords.txt/') assert deco - assert deco['trailing_dash'] == True - assert deco['proto'] == 'ftp' - assert deco['domain'] == ['ftp', 'example', 'com'] - assert deco['path'] == ['all', 'passwords.txt'] + assert deco['trailing_slash'] == True + assert deco['proto'] == 'ftp' + assert deco['domain'] == ['ftp', 'example', 'com'] + assert deco['path'] == ['all', 'passwords.txt'] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'ftp://mirror.edu.pl.eu.org') assert deco - assert deco['trailing_dash'] == False - assert deco['proto'] == 'ftp' - assert deco['domain'] == ['mirror', 'edu', 'pl', 'eu', 'org'] - assert deco['path'] == [] + assert deco['trailing_slash'] == False + assert deco['proto'] == 'ftp' + assert deco['domain'] == ['mirror', 'edu', 'pl', 'eu', 'org'] + assert deco['path'] == [] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'file:///mnt/parabola_chroot///etc/passwd') assert deco - assert deco['trailing_dash'] == False - assert deco['proto'] == 'file' - assert deco['path'] == ['mnt', 'parabola_chroot', 'etc', 'passwd'] + assert deco['trailing_slash'] == False + assert deco['proto'] == 'file' + assert deco['path'] == ['mnt', 'parabola_chroot', 'etc', 'passwd'] + assert 'domain' not in deco for bad_url in [ '://bad-url.missing/protocol', diff --git a/test/unit/test_patterns_query_tree.py b/test/unit/test_patterns_query_tree.py index 9fbc0c3..e282592 100644 --- a/test/unit/test_patterns_query_tree.py +++ b/test/unit/test_patterns_query_tree.py @@ -27,7 +27,7 @@ def patterns_tree_code(): def test_modify_branch(execute_in_page, patterns_tree_code): """ - patterns_query_tree.js contains Patterns Tree data structure that allows + patterns_query_tree.js contains Pattern Tree data structure that allows arrays of string labels to be mapped to items. Verify operations modifying a single branch of such tree work properly. """ @@ -68,7 +68,7 @@ def test_modify_branch(execute_in_page, patterns_tree_code): # the right result. branch = execute_in_page( '''{ - const branch = make_tree_node(); + const branch = empty_node(); modify_sequence(branch, ['com', 'example'], item_adder('some_item')); returnval(branch); }''') @@ -197,7 +197,7 @@ def test_modify_branch(execute_in_page, patterns_tree_code): def test_search_branch(execute_in_page, patterns_tree_code): """ - patterns_query_tree.js contains Patterns Tree data structure that allows + patterns_query_tree.js contains Pattern Tree data structure that allows arrays of string labels to be mapped to items. Verify searching a single branch of such tree work properly. """ @@ -210,7 +210,7 @@ def test_search_branch(execute_in_page, patterns_tree_code): # Let's construct some tree branch to test on. execute_in_page( ''' - var branch = make_tree_node(); + var branch = empty_node(); for (const [item, sequence] of [ ['(root)', []], @@ -281,3 +281,195 @@ def test_search_branch(execute_in_page, patterns_tree_code): print('sequence:', sequence, '\nexpected:', expected, '\nresult:', result, file=sys.stderr) raise e from None + +def test_pattern_tree(execute_in_page, patterns_tree_code): + """ + patterns_query_tree.js contains Pattern Tree data structure that allows + arrays of string labels to be mapped to items. + Verify operations on entire such tree work properly. + """ + execute_in_page(patterns_tree_code, page='https://gotmyowndoma.in') + + # Perform tests with all possible patterns for a simple URL. + url = 'https://example.com' + patterns = [ + 'https://example.com', + 'https://example.com/***', + 'https://***.example.com', + 'https://***.example.com/***' + ] + bad_patterns = [ + 'http://example.com', + 'https://a.example.com', + 'https://*.example.com', + 'https://**.example.com', + 'https://example.com/a', + 'https://example.com/*', + 'https://example.com/**', + ] + + expected = [{'key': p} for p in patterns] + + tree, result = execute_in_page( + '''{ + const tree = pattern_tree.make(); + for (const pattern of arguments[0].concat(arguments[1])) { + pattern_tree.register(tree, pattern, 'key', pattern); + pattern_tree.register(tree, pattern + '/', 'key', pattern + '/'); + } + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + patterns, bad_patterns, url) + assert expected == result + + # Also verify that deregistering half of the good patterns works correctly. + patterns_removed = [pattern for i, pattern in enumerate(patterns) if i % 2] + patterns = [pattern for i, pattern in enumerate(patterns) if not (i % 2)] + expected = [{'key': p} for p in patterns] + tree, result = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1]) { + pattern_tree.deregister(tree, pattern, 'key'); + pattern_tree.deregister(tree, pattern + '/', 'key'); + } + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + tree, patterns_removed, url) + assert expected == result + + # Also verify that deregistering all the patterns works correctly. + tree = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1].concat(arguments[2])) { + pattern_tree.deregister(tree, pattern, 'key'); + pattern_tree.deregister(tree, pattern + '/', 'key'); + } + returnval(tree); + }''', + tree, patterns, bad_patterns) + assert tree == {} + + # Perform tests with all possible patterns for a complex URL. + url = 'http://settings.query.example.com/google/tries/destroy/adblockers//' + patterns = [ + 'http://settings.query.example.com/google/tries/destroy/adblockers', + 'http://settings.query.example.com/google/tries/destroy/adblockers/***', + 'http://settings.query.example.com/google/tries/destroy/*', + 'http://settings.query.example.com/google/tries/destroy/***', + 'http://settings.query.example.com/google/tries/**', + 'http://settings.query.example.com/google/tries/***', + 'http://settings.query.example.com/google/**', + 'http://settings.query.example.com/google/***', + 'http://settings.query.example.com/**', + 'http://settings.query.example.com/***', + + 'http://***.settings.query.example.com/google/tries/destroy/adblockers', + 'http://***.settings.query.example.com/google/tries/destroy/adblockers/***', + 'http://***.settings.query.example.com/google/tries/destroy/*', + 'http://***.settings.query.example.com/google/tries/destroy/***', + 'http://***.settings.query.example.com/google/tries/**', + 'http://***.settings.query.example.com/google/tries/***', + 'http://***.settings.query.example.com/google/**', + 'http://***.settings.query.example.com/google/***', + 'http://***.settings.query.example.com/**', + 'http://***.settings.query.example.com/***', + 'http://*.query.example.com/google/tries/destroy/adblockers', + 'http://*.query.example.com/google/tries/destroy/adblockers/***', + 'http://*.query.example.com/google/tries/destroy/*', + 'http://*.query.example.com/google/tries/destroy/***', + 'http://*.query.example.com/google/tries/**', + 'http://*.query.example.com/google/tries/***', + 'http://*.query.example.com/google/**', + 'http://*.query.example.com/google/***', + 'http://*.query.example.com/**', + 'http://*.query.example.com/***', + 'http://***.query.example.com/google/tries/destroy/adblockers', + 'http://***.query.example.com/google/tries/destroy/adblockers/***', + 'http://***.query.example.com/google/tries/destroy/*', + 'http://***.query.example.com/google/tries/destroy/***', + 'http://***.query.example.com/google/tries/**', + 'http://***.query.example.com/google/tries/***', + 'http://***.query.example.com/google/**', + 'http://***.query.example.com/google/***', + 'http://***.query.example.com/**', + 'http://***.query.example.com/***', + 'http://**.example.com/google/tries/destroy/adblockers', + 'http://**.example.com/google/tries/destroy/adblockers/***', + 'http://**.example.com/google/tries/destroy/*', + 'http://**.example.com/google/tries/destroy/***', + 'http://**.example.com/google/tries/**', + 'http://**.example.com/google/tries/***', + 'http://**.example.com/google/**', + 'http://**.example.com/google/***', + 'http://**.example.com/**', + 'http://**.example.com/***', + 'http://***.example.com/google/tries/destroy/adblockers', + 'http://***.example.com/google/tries/destroy/adblockers/***', + 'http://***.example.com/google/tries/destroy/*', + 'http://***.example.com/google/tries/destroy/***', + 'http://***.example.com/google/tries/**', + 'http://***.example.com/google/tries/***', + 'http://***.example.com/google/**', + 'http://***.example.com/google/***', + 'http://***.example.com/**', + 'http://***.example.com/***' + ] + bad_patterns = [ + 'https://settings.query.example.com/google/tries/destroy/adblockers', + 'http://settings.query.example.com/google/tries/destroy/adblockers/a', + 'http://settings.query.example.com/google/tries/destroy/adblockers/*', + 'http://settings.query.example.com/google/tries/destroy/adblockers/**', + 'http://settings.query.example.com/google/tries/destroy/a', + 'http://settings.query.example.com/google/tries/destroy/**', + 'http://settings.query.example.com/google/tries/*', + 'http://a.settings.query.example.com/google/tries/destroy/adblockers', + 'http://*.settings.query.example.com/google/tries/destroy/adblockers', + 'http://**.settings.query.example.com/google/tries/destroy/adblockers', + 'http://a.query.example.com/google/tries/destroy/adblockers', + 'http://**.query.example.com/google/tries/destroy/adblockers', + 'http://*.example.com/google/tries/destroy/adblockers' + ] + + expected = [{'key': p + s} for p in patterns for s in ['/', '']] + + tree, result = execute_in_page( + '''{ + const tree = pattern_tree.make(); + for (const pattern of arguments[0].concat(arguments[1])) { + pattern_tree.register(tree, pattern, 'key', pattern); + pattern_tree.register(tree, pattern + '/', 'key', pattern + '/'); + } + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + patterns, bad_patterns, url) + assert expected == result + + # Also verify that deregistering all patterns with trailing slash works + # correctly. + expected = [{'key': p} for p in patterns] + tree, result = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1]) + pattern_tree.deregister(tree, pattern + '/', 'key'); + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + tree, patterns, url) + assert expected == result + + # Also verify that deregistering all the patterns works correctly. + tree = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1]) + pattern_tree.deregister(tree, pattern, 'key'); + for (const pattern of arguments[2]) { + pattern_tree.deregister(tree, pattern, 'key'); + pattern_tree.deregister(tree, pattern + '/', 'key'); + } + returnval(tree); + }''', + tree, patterns, bad_patterns) + assert tree == {} |