diff options
author | Wojtek Kosior <koszko@koszko.org> | 2022-10-07 10:30:38 +0200 |
---|---|---|
committer | Wojtek Kosior <koszko@koszko.org> | 2022-10-07 11:15:19 +0200 |
commit | 2e5b690e84fd168ba169c17581f59b8e1d1f430e (patch) | |
tree | 8239ed9531e9e5b41af1468f123092037c7ec5eb /src/hydrilla/proxy/policies | |
parent | 48f80ae480e2fc0eabbdb5041e841b80c0f788f4 (diff) | |
download | haketilo-hydrilla-2e5b690e84fd168ba169c17581f59b8e1d1f430e.tar.gz haketilo-hydrilla-2e5b690e84fd168ba169c17581f59b8e1d1f430e.zip |
[proxy] support unrestricted HTTP requests API in Haketilo payloads
Diffstat (limited to 'src/hydrilla/proxy/policies')
-rw-r--r-- | src/hydrilla/proxy/policies/base.py | 13 | ||||
-rw-r--r-- | src/hydrilla/proxy/policies/js_templates/page_init_script.js.jinja | 145 | ||||
-rw-r--r-- | src/hydrilla/proxy/policies/payload.py | 6 | ||||
-rw-r--r-- | src/hydrilla/proxy/policies/payload_resource.py | 289 | ||||
-rw-r--r-- | src/hydrilla/proxy/policies/rule.py | 2 | ||||
-rw-r--r-- | src/hydrilla/proxy/policies/web_ui.py | 2 |
6 files changed, 435 insertions, 22 deletions
diff --git a/src/hydrilla/proxy/policies/base.py b/src/hydrilla/proxy/policies/base.py index b7beba3..c02ea0b 100644 --- a/src/hydrilla/proxy/policies/base.py +++ b/src/hydrilla/proxy/policies/base.py @@ -40,6 +40,7 @@ from abc import ABC, abstractmethod from immutables import Map +from ... url_patterns import ParsedUrl from .. import state from .. import http_messages @@ -57,12 +58,18 @@ ProducedMessage = t.Union[ class Policy(ABC): """....""" - process_request: t.ClassVar[bool] = False - process_response: t.ClassVar[bool] = False - anticache: t.ClassVar[bool] = True + _process_request: t.ClassVar[bool] = False + _process_response: t.ClassVar[bool] = False + anticache: t.ClassVar[bool] = True priority: t.ClassVar[PolicyPriority] + def should_process_request(self, parsed_url: ParsedUrl) -> bool: + return self._process_request + + def should_process_response(self, parsed_url: ParsedUrl) -> bool: + return self._process_response + def consume_request(self, request_info: http_messages.RequestInfo) \ -> t.Optional[ProducedMessage]: raise NotImplementedError( diff --git a/src/hydrilla/proxy/policies/js_templates/page_init_script.js.jinja b/src/hydrilla/proxy/policies/js_templates/page_init_script.js.jinja new file mode 100644 index 0000000..3a8382c --- /dev/null +++ b/src/hydrilla/proxy/policies/js_templates/page_init_script.js.jinja @@ -0,0 +1,145 @@ +{# +SPDX-License-Identifier: GPL-3.0-or-later + +Haketilo page APIs code template. + +This file is part of Hydrilla&Haketilo. + +Copyright (C) 2021,2022 Wojtek Kosior + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +As additional permission under GNU GPL version 3 section 7, you +may distribute forms of that code without the copy of the GNU +GPL normally required by section 4, provided you include this +license notice and, in case of non-source distribution, a URL +through which recipients can access the Corresponding Source. +If you modify file(s) with this exception, you may extend this +exception to your version of the file(s), but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. + +As a special exception to the GPL, any HTML file which merely +makes function calls to this code, and for that purpose +includes it by reference shall be deemed a separate work for +copyright law purposes. If you modify this code, you may extend +this exception to your version of the code, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>. + + +I, Wojtek Kosior, thereby promise not to sue for violation of this +file's license. Although I request that you do not make use of this +code in a proprietary program, I am not going to enforce this in court. +-#} + +(function(){ + /* + * Snapshot some variables that other code could theoretically redefine + * later. We're not making the effort to protect from redefinition of + * prototype properties right now. + */ + const console = window.console; + const fetch = window.fetch; + const JSON = window.JSON; + const URL = window.URL; + const Array = window.Array; + const Uint8Array = window.Uint8Array; + const CustomEvent = window.CustomEvent; + const window_dispatchEvent = window.dispatchEvent; + + /* Get values from the proxy. */ + function decode_jinja(str) { + return decodeURIComponent(atob(str)); + } + const unique_token = decode_jinja("{{ unique_token_encoded }}"); + const assets_base_url = decode_jinja("{{ assets_base_url_encoded }}"); + + /* Make it possible to serialize an Error object. */ + function error_data_jsonifiable(error) { + const jsonifiable = {}; + for (const property of ["name", "message", "fileName", "lineNumber"]) + jsonifiable[property] = error[property]; + + return jsonifiable; + } + + /* Make it possible to serialize a Uint8Array. */ + function uint8_to_hex(array) { + return [...array].map(b => ("0" + b.toString(16)).slice(-2)).join(""); + } + + async function on_unrestricted_http_request(event) { + const name = "haketilo_CORS_bypass"; + + if (typeof event.detail !== "object" || + event.detail === null || + typeof event.detail.id !== "string" || + typeof event.detail.data !== "string") { + console.error(`Unrestricted HTTP: Invalid detail.`, event.detail); + return; + } + + try { + const data = JSON.parse(event.detail.data); + + const params = new URLSearchParams({ + target_url: data.url, + extra_headers: JSON.stringify(data.headers || []) + }); + const replacement_url = assets_base_url + "api/unrestricted_http"; + const replacement_url_obj = new URL(replacement_url); + replacement_url_obj.search = params; + + const response = await fetch(replacement_url_obj.href, data.init); + const response_buffer = await response.arrayBuffer(); + + const true_headers_serialized = + response.headers.get("x-haketilo-true-headers"); + + if (true_headers_serialized === null) + throw new Error("Unrestricted HTTP: The 'X-Haketilo-True-Headers' HTTP response header is missing. Are we connected to Haketilo proxy?") + + const true_headers = JSON.parse( + decodeURIComponent(true_headers_serialized) + ); + + const bad_format_error_msg = + "Unrestricted HTTP: The 'X-Haketilo-True-Headers' HTTP response header has invalid format."; + + if (!Array.isArray(true_headers)) + throw new Error(bad_format_error_msg); + + for (const [header, value] of true_headers) { + if (typeof header !== "string" || typeof value !== "string") + throw new Error(bad_format_error_msg); + } + + var result = { + status: response.status, + statusText: response.statusText, + headers: true_headers, + body: uint8_to_hex(new Uint8Array(response_buffer)) + }; + } catch(e) { + var result = {error: error_data_jsonifiable(e)}; + } + + const response_name = `${name}-${event.detail.id}`; + const detail = JSON.stringify(result); + window_dispatchEvent(new CustomEvent(response_name, {detail})); +} + +window.addEventListener("haketilo_CORS_bypass", on_unrestricted_http_request); +})(); diff --git a/src/hydrilla/proxy/policies/payload.py b/src/hydrilla/proxy/policies/payload.py index c50bdef..7eef184 100644 --- a/src/hydrilla/proxy/policies/payload.py +++ b/src/hydrilla/proxy/policies/payload.py @@ -157,7 +157,7 @@ def block_attr(element: bs4.PageElement, attr_name: str) -> None: @dc.dataclass(frozen=True) class PayloadInjectPolicy(PayloadAwarePolicy): """....""" - process_response: t.ClassVar[bool] = True + _process_response: t.ClassVar[bool] = True priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._TWO @@ -192,6 +192,8 @@ class PayloadInjectPolicy(PayloadAwarePolicy): base_url = self.assets_base_url(url) payload_ref = self.payload_data.ref + yield base_url + 'api/page_init_script.js' + for path in payload_ref.get_script_paths(): yield base_url + '/'.join(('static', *path)) @@ -323,7 +325,7 @@ class AutoPayloadInjectPolicy(PayloadInjectPolicy): @dc.dataclass(frozen=True) class PayloadSuggestPolicy(PayloadAwarePolicy): """....""" - process_request: t.ClassVar[bool] = True + _process_request: t.ClassVar[bool] = True priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._ONE diff --git a/src/hydrilla/proxy/policies/payload_resource.py b/src/hydrilla/proxy/policies/payload_resource.py index 30b28f2..cda19ba 100644 --- a/src/hydrilla/proxy/policies/payload_resource.py +++ b/src/hydrilla/proxy/policies/payload_resource.py @@ -59,21 +59,195 @@ from __future__ import annotations import dataclasses as dc import typing as t +import json + +from threading import Lock +from base64 import b64encode +from urllib.parse import quote, parse_qs, urlparse, urlencode, urljoin + +import jinja2 from ...translations import smart_gettext as _ +from ...url_patterns import ParsedUrl from .. import state from .. import http_messages from . import base from .payload import PayloadAwarePolicy, PayloadAwarePolicyFactory +loader = jinja2.PackageLoader(__package__, package_path='js_templates') +jinja_env = jinja2.Environment( + loader = loader, + lstrip_blocks = True, + autoescape = False +) +jinja_lock = Lock() + + +def encode_string_for_js(string: str) -> str: + return b64encode(quote(string).encode()).decode() + + +AnyValue = t.TypeVar('AnyValue', bound=object) + +def header_keys(headers: t.Iterable[tuple[str, AnyValue]]) -> frozenset[str]: + return frozenset(header.lower() for header, _ in headers) + +def _merge_headers( + standard_headers: t.Iterable[tuple[str, t.Optional[str]]], + overridable_headers_keys: frozenset[str], + native_headers: http_messages.IHeaders, + extra_headers: t.Iterable[tuple[str, str]] +) -> t.Iterable[tuple[str, str]]: + standard_keys = header_keys(standard_headers) + standard_iterator = iter(standard_headers) + native_keys = header_keys(native_headers.items()) + + selected_base: list[tuple[str, str]] = [] + processed: set[str] = set() + + for header, _ in native_headers.items(): + header_l = header.lower() + + if header_l in processed or header_l not in standard_keys: + continue + + for standard_header_l, chosen_value in standard_iterator: + if standard_header_l not in native_keys: + if chosen_value is not None: + selected_base.append((standard_header_l, chosen_value)) + elif standard_header_l == header_l: + processed.add(header_l) + + if header_l in overridable_headers_keys: + chosen_value = native_headers.get(header_l, chosen_value) + + if chosen_value is not None: + selected_base.append((header, chosen_value)) + + break + + for standard_header_l, standard_value in standard_iterator: + if standard_value is not None: + selected_base.append((standard_header_l, standard_value)) + + extra_keys = header_keys(extra_headers) + extra_iterator = iter(extra_headers) + + result: list[tuple[str, str]] = [] + processed = set() + + for header, value in selected_base: + header_l = header.lower() + + if header_l in processed: + continue + + if header_l in extra_keys: + for extra_header, extra_value in extra_iterator: + extra_header_l = extra_header.lower() + + processed.add(extra_header_l) + + result.append((extra_header, extra_value)) + + if extra_header_l == header_l: + break + else: + result.append((header, value)) + + result.extend(extra_iterator) + + return result + +request_standard_headers: t.Iterable[tuple[str, t.Optional[str]]] = ( + ('user-agent', None), + ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'), + ('accept-language', 'en-US,en;q=0.5'), + ('accept-encoding', None), + ('dnt', '1'), + ('connection', None), + ('upgrade-insecure-requests', '1'), + ('sec-fetch-dest', 'document'), + ('sec-fetch-mode', 'navigate'), + ('sec-fetch-site', 'none'), + ('sec-fetch-user', '?1'), + ('te', 'trailers') +) + +auto_overridable_request_headers = frozenset(( + 'user-agent', + 'accept-language', + 'accept-encoding', + 'dnt' +)) + +def merge_request_headers( + native_headers: http_messages.IHeaders, + extra_headers: t.Iterable[tuple[str, str]] +) -> t.Iterable[tuple[str, str]]: + return _merge_headers( + standard_headers = request_standard_headers, + overridable_headers_keys = auto_overridable_request_headers, + native_headers = native_headers, + extra_headers = extra_headers + ) + +response_standard_headers: t.Iterable[tuple[str, t.Optional[str]]] = ( + ('cache-control', 'max-age=0, private, must-revalidate'), + ('connection', None), + ('content-length', None), + ('content-type', None), + ('date', None), + ('keep-alive', None), + ('server', None) +) + +auto_overridable_response_headers = frozenset( + header.lower() + for header, value in response_standard_headers + if value is None +) + +def merge_response_headers( + native_headers: http_messages.IHeaders, + extra_headers: t.Iterable[tuple[str, str]] +) -> t.Iterable[tuple[str, str]]: + return _merge_headers( + standard_headers = response_standard_headers, + overridable_headers_keys = auto_overridable_response_headers, + native_headers = native_headers, + extra_headers = extra_headers + ) + + +ProducedAny = t.Union[ + http_messages.ProducedResponse, + http_messages.ProducedRequest +] + @dc.dataclass(frozen=True) class PayloadResourcePolicy(PayloadAwarePolicy): """....""" - process_request: t.ClassVar[bool] = True + _process_request: t.ClassVar[bool] = True priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._THREE + def extract_resource_path(self, request_url: ParsedUrl) -> tuple[str, ...]: + # Payload resource pattern has path of the form: + # "/some/arbitrary/segments/<per-session_token>/***" + # + # Corresponding requests shall have path of the form: + # "/some/arbitrary/segments/<per-session_token>/actual/resource/path" + # + # Here we need to extract the "/actual/resource/path" part. + segments_to_drop = len(self.payload_data.pattern_path_segments) + 1 + return request_url.path_segments[segments_to_drop:] + + def should_process_response(self, request_url: ParsedUrl) -> bool: + return self.extract_resource_path(request_url) \ + == ('api', 'unrestricted_http') + def _make_file_resource_response(self, path: tuple[str, ...]) \ -> http_messages.ProducedResponse: """....""" @@ -95,29 +269,114 @@ class PayloadResourcePolicy(PayloadAwarePolicy): file_data.contents ) + def _make_api_response( + self, + path: tuple[str, ...], + request_info: http_messages.RequestInfo + ) -> ProducedAny: + if path[0] == 'page_init_script.js': + with jinja_lock: + template = jinja_env.get_template('page_init_script.js.jinja') + token = self.payload_data.unique_token + base_url = self.assets_base_url(request_info.url) + js = template.render( + unique_token_encoded = encode_string_for_js(token), + assets_base_url_encoded = encode_string_for_js(base_url) + ) + + return http_messages.ProducedResponse( + 200, + ((b'Content-Type', b'application/javascript'),), + js.encode() + ) + + if path[0] == 'unrestricted_http': + try: + assert self.payload_data.cors_bypass_allowed + + params = parse_qs(request_info.url.query) + target_url, = params['target_url'] + extra_headers_str, = params['extra_headers'] + + assert urlparse(target_url).scheme in ('http', 'https') + + extra_headers = json.loads(extra_headers_str) + assert isinstance(extra_headers, list) + for header, value in extra_headers: + assert isinstance(header, str) + assert isinstance(value, str) + + result_headers = merge_request_headers( + native_headers = request_info.headers, + extra_headers = extra_headers + ) + + result_headers_bytes = \ + [(h.encode(), v.encode()) for h, v in result_headers] + + return http_messages.ProducedRequest( + url = target_url, + method = request_info.method, + headers = result_headers_bytes, + body = request_info.body + ) + except: + return resource_blocked_response + else: + return resource_blocked_response + def consume_request(self, request_info: http_messages.RequestInfo) \ - -> http_messages.ProducedResponse: - """....""" - # Payload resource pattern has path of the form: - # "/some/arbitrary/segments/<per-session_token>/***" - # - # Corresponding requests shall have path of the form: - # "/some/arbitrary/segments/<per-session_token>/actual/resource/path" - # - # Here we need to extract the "/actual/resource/path" part. - segments_to_drop = len(self.payload_data.pattern_path_segments) + 1 - resource_path = request_info.url.path_segments[segments_to_drop:] + -> ProducedAny: + resource_path = self.extract_resource_path(request_info.url) if resource_path == (): return resource_blocked_response elif resource_path[0] == 'static': return self._make_file_resource_response(resource_path[1:]) elif resource_path[0] == 'api': - # TODO: implement Haketilo APIs - return resource_blocked_response + return self._make_api_response(resource_path[1:], request_info) else: return resource_blocked_response + def consume_response(self, response_info: http_messages.ResponseInfo) \ + -> http_messages.ProducedResponse: + """ + This method shall only be called for responses to unrestricted HTTP API + requests. Its purpose is to sanitize response headers and smuggle their + original data using an additional header. + """ + serialized = json.dumps([*response_info.headers.items()]) + extra_headers = [('X-Haketilo-True-Headers', quote(serialized)),] + + if (300 <= response_info.status_code < 400): + location = response_info.headers.get('location') + if location is not None: + orig_params = parse_qs(response_info.orig_url.query) + orig_extra_headers_str, = orig_params['extra_headers'] + + new_query = urlencode({ + 'target_url': location, + 'extra_headers': orig_extra_headers_str + }) + + new_url = urljoin( + response_info.orig_url.orig_url, + '?' + new_query + ) + + extra_headers.append(('location', new_url)) + + merged_headers = merge_response_headers( + native_headers = response_info.headers, + extra_headers = extra_headers + ) + + return http_messages.ProducedResponse( + status_code = response_info.status_code, + headers = [(h.encode(), v.encode()) for h, v in merged_headers], + body = response_info.body, + ) + resource_blocked_response = http_messages.ProducedResponse( 403, @@ -128,7 +387,7 @@ resource_blocked_response = http_messages.ProducedResponse( @dc.dataclass(frozen=True) class BlockedResponsePolicy(base.Policy): """....""" - process_request: t.ClassVar[bool] = True + _process_request: t.ClassVar[bool] = True priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._THREE diff --git a/src/hydrilla/proxy/policies/rule.py b/src/hydrilla/proxy/policies/rule.py index 6482e84..833d287 100644 --- a/src/hydrilla/proxy/policies/rule.py +++ b/src/hydrilla/proxy/policies/rule.py @@ -48,7 +48,7 @@ class AllowPolicy(base.Policy): class BlockPolicy(base.Policy): """....""" - process_response: t.ClassVar[bool] = True + _process_response: t.ClassVar[bool] = True priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._TWO diff --git a/src/hydrilla/proxy/policies/web_ui.py b/src/hydrilla/proxy/policies/web_ui.py index 2b1ae02..9f6c0f5 100644 --- a/src/hydrilla/proxy/policies/web_ui.py +++ b/src/hydrilla/proxy/policies/web_ui.py @@ -45,7 +45,7 @@ from . import base @dc.dataclass(frozen=True) class WebUIPolicy(base.Policy): """....""" - process_request: t.ClassVar[bool] = True + _process_request: t.ClassVar[bool] = True priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._THREE |