From e1282a63d6e41d437dd1b14a08baf89b78ab56cc Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Sat, 4 Dec 2021 19:31:43 +0100 Subject: finish implementing more efficient querying of URL patterns The algorithm is implemented and tested. However, it is yet to be hooked into the actual extension. --- test/profiles.py | 39 ++++++- test/unit/conftest.py | 12 +- test/unit/test_patterns.py | 39 +++---- test/unit/test_patterns_query_tree.py | 200 +++++++++++++++++++++++++++++++++- 4 files changed, 260 insertions(+), 30 deletions(-) (limited to 'test') diff --git a/test/profiles.py b/test/profiles.py index d6a4efc..1530aea 100755 --- a/test/profiles.py +++ b/test/profiles.py @@ -31,7 +31,28 @@ import time from .misc_constants import * +class HaketiloFirefox(webdriver.Firefox): + """ + This wrapper class around selenium.webdriver.Firefox adds a `loaded_scripts` + instance property that is reset to an empty list every time the + `get()` method is called. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.reset_loaded_scripts() + + def reset_loaded_scripts(self): + self.loaded_scripts = [] + + def get(self, *args, **kwargs): + self.reset_loaded_scripts() + super().get(*args, **kwargs) + def set_profile_proxy(profile, proxy_host, proxy_port): + """ + Configure the given Firefox profile to use the specified HTTP proxy for all + protocols. + """ # proxy type 1 designates "manual" profile.set_preference('network.proxy.type', 1) profile.set_preference('network.proxy.no_proxies_on', '') @@ -49,6 +70,10 @@ def set_profile_console_logging(profile): def firefox_safe_mode(firefox_binary=default_firefox_binary, proxy_host=default_proxy_host, proxy_port=default_proxy_port): + """ + Initialize a Firefox instance controlled by selenium. The instance is + started in safe mode.
+ """ profile = webdriver.FirefoxProfile() set_profile_proxy(profile, proxy_host, proxy_port) set_profile_console_logging(profile) @@ -56,16 +81,22 @@ def firefox_safe_mode(firefox_binary=default_firefox_binary, options = Options() options.add_argument('--safe-mode') - return webdriver.Firefox(options=options, firefox_profile=profile, - firefox_binary=firefox_binary) + return HaketiloFirefox(options=options, firefox_profile=profile, + firefox_binary=firefox_binary) def firefox_with_profile(firefox_binary=default_firefox_binary, profile_dir=default_clean_profile_dir, proxy_host=default_proxy_host, proxy_port=default_proxy_port): + """ + Initialize a Firefox instance controlled by selenium. The instance is + started using an empty profile (either the default one or the one passed to + `configure` script). The empty profile is meant to make Firefox start with + globally-installed extensions disabled. + """ profile = webdriver.FirefoxProfile(profile_dir) set_profile_proxy(profile, proxy_host, proxy_port) set_profile_console_logging(profile) - return webdriver.Firefox(firefox_profile=profile, - firefox_binary=firefox_binary) + return HaketiloFirefox(firefox_profile=profile, + firefox_binary=firefox_binary) diff --git a/test/unit/conftest.py b/test/unit/conftest.py index 62cc1a0..1500006 100644 --- a/test/unit/conftest.py +++ b/test/unit/conftest.py @@ -78,13 +78,19 @@ return window.haketilo_selenium_return_value; def _execute_in_page_context(driver, script, args): script = script + '\n;\nwindow.haketilo_selenium_exception = false;' + driver.loaded_scripts.append(script) try: return driver.execute_script(script_injecting_script, script, args) except Exception as e: import sys - lines = enumerate(script.split('\n'), 1) - for err_info in [('Failing script\n',), *lines]: - print(*err_info, file=sys.stderr) + + print("Scripts loaded since driver's last get() method call:", + file=sys.stderr) + + for script in driver.loaded_scripts: + lines = enumerate(script.split('\n'), 1) 
+ for err_info in [('===',), *lines]: + print(*err_info, file=sys.stderr) raise e from None diff --git a/test/unit/test_patterns.py b/test/unit/test_patterns.py index 4cfc10c..802bf4e 100644 --- a/test/unit/test_patterns.py +++ b/test/unit/test_patterns.py @@ -100,41 +100,42 @@ def test_deconstruct_url(execute_in_page, patterns_code): deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'https://eXaMpLe.com/a/b?ver=1.2.3#heading2') assert deco - assert deco['trailing_dash'] == False - assert deco['proto'] == 'https' - assert deco['domain'] == ['example', 'com'] - assert deco['path'] == ['a', 'b'] + assert deco['trailing_slash'] == False + assert deco['proto'] == 'https' + assert deco['domain'] == ['example', 'com'] + assert deco['path'] == ['a', 'b'] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'http://**.example.com/') assert deco - assert deco['trailing_dash'] == True - assert deco['proto'] == 'http' - assert deco['domain'] == ['**', 'example', 'com'] - assert deco['path'] == [] + assert deco['trailing_slash'] == True + assert deco['proto'] == 'http' + assert deco['domain'] == ['**', 'example', 'com'] + assert deco['path'] == [] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'ftp://user@ftp.example.com/all///passwords.txt/') assert deco - assert deco['trailing_dash'] == True - assert deco['proto'] == 'ftp' - assert deco['domain'] == ['ftp', 'example', 'com'] - assert deco['path'] == ['all', 'passwords.txt'] + assert deco['trailing_slash'] == True + assert deco['proto'] == 'ftp' + assert deco['domain'] == ['ftp', 'example', 'com'] + assert deco['path'] == ['all', 'passwords.txt'] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'ftp://mirror.edu.pl.eu.org') assert deco - assert deco['trailing_dash'] == False - assert deco['proto'] == 'ftp' - assert deco['domain'] == ['mirror', 'edu', 'pl', 'eu', 'org'] - assert deco['path'] == [] + assert deco['trailing_slash'] == False + assert 
deco['proto'] == 'ftp' + assert deco['domain'] == ['mirror', 'edu', 'pl', 'eu', 'org'] + assert deco['path'] == [] deco = execute_in_page('returnval(deconstruct_url(arguments[0]));', 'file:///mnt/parabola_chroot///etc/passwd') assert deco - assert deco['trailing_dash'] == False - assert deco['proto'] == 'file' - assert deco['path'] == ['mnt', 'parabola_chroot', 'etc', 'passwd'] + assert deco['trailing_slash'] == False + assert deco['proto'] == 'file' + assert deco['path'] == ['mnt', 'parabola_chroot', 'etc', 'passwd'] + assert 'domain' not in deco for bad_url in [ '://bad-url.missing/protocol', diff --git a/test/unit/test_patterns_query_tree.py b/test/unit/test_patterns_query_tree.py index 9fbc0c3..e282592 100644 --- a/test/unit/test_patterns_query_tree.py +++ b/test/unit/test_patterns_query_tree.py @@ -27,7 +27,7 @@ def patterns_tree_code(): def test_modify_branch(execute_in_page, patterns_tree_code): """ - patterns_query_tree.js contains Patterns Tree data structure that allows + patterns_query_tree.js contains Pattern Tree data structure that allows arrays of string labels to be mapped to items. Verify operations modifying a single branch of such tree work properly. """ @@ -68,7 +68,7 @@ def test_modify_branch(execute_in_page, patterns_tree_code): # the right result. branch = execute_in_page( '''{ - const branch = make_tree_node(); + const branch = empty_node(); modify_sequence(branch, ['com', 'example'], item_adder('some_item')); returnval(branch); }''') @@ -197,7 +197,7 @@ def test_modify_branch(execute_in_page, patterns_tree_code): def test_search_branch(execute_in_page, patterns_tree_code): """ - patterns_query_tree.js contains Patterns Tree data structure that allows + patterns_query_tree.js contains Pattern Tree data structure that allows arrays of string labels to be mapped to items. Verify searching a single branch of such tree work properly. 
""" @@ -210,7 +210,7 @@ def test_search_branch(execute_in_page, patterns_tree_code): # Let's construct some tree branch to test on. execute_in_page( ''' - var branch = make_tree_node(); + var branch = empty_node(); for (const [item, sequence] of [ ['(root)', []], @@ -281,3 +281,195 @@ def test_search_branch(execute_in_page, patterns_tree_code): print('sequence:', sequence, '\nexpected:', expected, '\nresult:', result, file=sys.stderr) raise e from None + +def test_pattern_tree(execute_in_page, patterns_tree_code): + """ + patterns_query_tree.js contains Pattern Tree data structure that allows + arrays of string labels to be mapped to items. + Verify operations on entire such tree work properly. + """ + execute_in_page(patterns_tree_code, page='https://gotmyowndoma.in') + + # Perform tests with all possible patterns for a simple URL. + url = 'https://example.com' + patterns = [ + 'https://example.com', + 'https://example.com/***', + 'https://***.example.com', + 'https://***.example.com/***' + ] + bad_patterns = [ + 'http://example.com', + 'https://a.example.com', + 'https://*.example.com', + 'https://**.example.com', + 'https://example.com/a', + 'https://example.com/*', + 'https://example.com/**', + ] + + expected = [{'key': p} for p in patterns] + + tree, result = execute_in_page( + '''{ + const tree = pattern_tree.make(); + for (const pattern of arguments[0].concat(arguments[1])) { + pattern_tree.register(tree, pattern, 'key', pattern); + pattern_tree.register(tree, pattern + '/', 'key', pattern + '/'); + } + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + patterns, bad_patterns, url) + assert expected == result + + # Also verify that deregistering half of the good patterns works correctly. 
+ patterns_removed = [pattern for i, pattern in enumerate(patterns) if i % 2] + patterns = [pattern for i, pattern in enumerate(patterns) if not (i % 2)] + expected = [{'key': p} for p in patterns] + tree, result = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1]) { + pattern_tree.deregister(tree, pattern, 'key'); + pattern_tree.deregister(tree, pattern + '/', 'key'); + } + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + tree, patterns_removed, url) + assert expected == result + + # Also verify that deregistering all the patterns works correctly. + tree = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1].concat(arguments[2])) { + pattern_tree.deregister(tree, pattern, 'key'); + pattern_tree.deregister(tree, pattern + '/', 'key'); + } + returnval(tree); + }''', + tree, patterns, bad_patterns) + assert tree == {} + + # Perform tests with all possible patterns for a complex URL. + url = 'http://settings.query.example.com/google/tries/destroy/adblockers//' + patterns = [ + 'http://settings.query.example.com/google/tries/destroy/adblockers', + 'http://settings.query.example.com/google/tries/destroy/adblockers/***', + 'http://settings.query.example.com/google/tries/destroy/*', + 'http://settings.query.example.com/google/tries/destroy/***', + 'http://settings.query.example.com/google/tries/**', + 'http://settings.query.example.com/google/tries/***', + 'http://settings.query.example.com/google/**', + 'http://settings.query.example.com/google/***', + 'http://settings.query.example.com/**', + 'http://settings.query.example.com/***', + + 'http://***.settings.query.example.com/google/tries/destroy/adblockers', + 'http://***.settings.query.example.com/google/tries/destroy/adblockers/***', + 'http://***.settings.query.example.com/google/tries/destroy/*', + 'http://***.settings.query.example.com/google/tries/destroy/***', + 
'http://***.settings.query.example.com/google/tries/**', + 'http://***.settings.query.example.com/google/tries/***', + 'http://***.settings.query.example.com/google/**', + 'http://***.settings.query.example.com/google/***', + 'http://***.settings.query.example.com/**', + 'http://***.settings.query.example.com/***', + 'http://*.query.example.com/google/tries/destroy/adblockers', + 'http://*.query.example.com/google/tries/destroy/adblockers/***', + 'http://*.query.example.com/google/tries/destroy/*', + 'http://*.query.example.com/google/tries/destroy/***', + 'http://*.query.example.com/google/tries/**', + 'http://*.query.example.com/google/tries/***', + 'http://*.query.example.com/google/**', + 'http://*.query.example.com/google/***', + 'http://*.query.example.com/**', + 'http://*.query.example.com/***', + 'http://***.query.example.com/google/tries/destroy/adblockers', + 'http://***.query.example.com/google/tries/destroy/adblockers/***', + 'http://***.query.example.com/google/tries/destroy/*', + 'http://***.query.example.com/google/tries/destroy/***', + 'http://***.query.example.com/google/tries/**', + 'http://***.query.example.com/google/tries/***', + 'http://***.query.example.com/google/**', + 'http://***.query.example.com/google/***', + 'http://***.query.example.com/**', + 'http://***.query.example.com/***', + 'http://**.example.com/google/tries/destroy/adblockers', + 'http://**.example.com/google/tries/destroy/adblockers/***', + 'http://**.example.com/google/tries/destroy/*', + 'http://**.example.com/google/tries/destroy/***', + 'http://**.example.com/google/tries/**', + 'http://**.example.com/google/tries/***', + 'http://**.example.com/google/**', + 'http://**.example.com/google/***', + 'http://**.example.com/**', + 'http://**.example.com/***', + 'http://***.example.com/google/tries/destroy/adblockers', + 'http://***.example.com/google/tries/destroy/adblockers/***', + 'http://***.example.com/google/tries/destroy/*', + 
'http://***.example.com/google/tries/destroy/***', + 'http://***.example.com/google/tries/**', + 'http://***.example.com/google/tries/***', + 'http://***.example.com/google/**', + 'http://***.example.com/google/***', + 'http://***.example.com/**', + 'http://***.example.com/***' + ] + bad_patterns = [ + 'https://settings.query.example.com/google/tries/destroy/adblockers', + 'http://settings.query.example.com/google/tries/destroy/adblockers/a', + 'http://settings.query.example.com/google/tries/destroy/adblockers/*', + 'http://settings.query.example.com/google/tries/destroy/adblockers/**', + 'http://settings.query.example.com/google/tries/destroy/a', + 'http://settings.query.example.com/google/tries/destroy/**', + 'http://settings.query.example.com/google/tries/*', + 'http://a.settings.query.example.com/google/tries/destroy/adblockers', + 'http://*.settings.query.example.com/google/tries/destroy/adblockers', + 'http://**.settings.query.example.com/google/tries/destroy/adblockers', + 'http://a.query.example.com/google/tries/destroy/adblockers', + 'http://**.query.example.com/google/tries/destroy/adblockers', + 'http://*.example.com/google/tries/destroy/adblockers' + ] + + expected = [{'key': p + s} for p in patterns for s in ['/', '']] + + tree, result = execute_in_page( + '''{ + const tree = pattern_tree.make(); + for (const pattern of arguments[0].concat(arguments[1])) { + pattern_tree.register(tree, pattern, 'key', pattern); + pattern_tree.register(tree, pattern + '/', 'key', pattern + '/'); + } + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + patterns, bad_patterns, url) + assert expected == result + + # Also verify that deregistering all patterns with trailing slash works + # correctly. 
+ expected = [{'key': p} for p in patterns] + tree, result = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1]) + pattern_tree.deregister(tree, pattern + '/', 'key'); + returnval([tree, [...pattern_tree.search(tree, arguments[2])]]); + }''', + tree, patterns, url) + assert expected == result + + # Also verify that deregistering all the patterns works correctly. + tree = execute_in_page( + '''{ + const tree = arguments[0]; + for (const pattern of arguments[1]) + pattern_tree.deregister(tree, pattern, 'key'); + for (const pattern of arguments[2]) { + pattern_tree.deregister(tree, pattern, 'key'); + pattern_tree.deregister(tree, pattern + '/', 'key'); + } + returnval(tree); + }''', + tree, patterns, bad_patterns) + assert tree == {} -- cgit v1.2.3