From 52d12a4fa124daa1595529e3e7008276a7986d95 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Mon, 13 Jun 2022 11:06:49 +0200 Subject: unfinished partial work --- src/hydrilla/proxy/__init__.py | 5 + src/hydrilla/proxy/addon.py | 177 +++++++++++++++++ src/hydrilla/proxy/flow_handlers.py | 383 ++++++++++++++++++++++++++++++++++++ src/hydrilla/proxy/policies.py | 76 +++++++ src/hydrilla/proxy/state.py | 73 +++++++ src/hydrilla/proxy/store.py | 40 ++++ 6 files changed, 754 insertions(+) create mode 100644 src/hydrilla/proxy/__init__.py create mode 100644 src/hydrilla/proxy/addon.py create mode 100644 src/hydrilla/proxy/flow_handlers.py create mode 100644 src/hydrilla/proxy/policies.py create mode 100644 src/hydrilla/proxy/state.py create mode 100644 src/hydrilla/proxy/store.py (limited to 'src/hydrilla/proxy') diff --git a/src/hydrilla/proxy/__init__.py b/src/hydrilla/proxy/__init__.py new file mode 100644 index 0000000..d382ead --- /dev/null +++ b/src/hydrilla/proxy/__init__.py @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: CC0-1.0 + +# Copyright (C) 2022 Wojtek Kosior +# +# Available under the terms of Creative Commons Zero v1.0 Universal. diff --git a/src/hydrilla/proxy/addon.py b/src/hydrilla/proxy/addon.py new file mode 100644 index 0000000..7d6487b --- /dev/null +++ b/src/hydrilla/proxy/addon.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# Haketilo addon for Mitmproxy. +# +# This file is part of Hydrilla&Haketilo. +# +# Copyright (C) 2022 Wojtek Kosior +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# Registry of per-flow handlers, keyed by ``id()`` of the flow object.
# ``t.Dict`` is used instead of ``dict[...]`` because this alias is evaluated
# at import time and subscripting the builtin requires Python 3.9, while this
# module is meant to remain usable with Python 3.7.
FlowHandlers = t.Dict[int, FlowHandler]

# Callback a flow handler may return to request a modification of the state.
StateUpdater = t.Callable[[HaketiloState], None]

# Signature of the undecorated HTTP event methods of HaketiloAddon.
HTTPHandlerFun = t.Callable[
    ['HaketiloAddon', http.HTTPFlow],
    t.Optional[StateUpdater]
]

def http_event_handler(handler_fun: HTTPHandlerFun):
    """
    Decorate an HTTP event method of HaketiloAddon.

    The returned wrapper verifies that the addon has been configured, calls
    `handler_fun` and, if a state updater is returned, applies that updater to
    the addon's state.

    The wrapper keeps the name of `handler_fun`. This matters because
    mitmproxy's `concurrent` decorator inspects `__name__` of the function it
    receives and rejects anything that is not named after an event hook.
    """
    # Local import: keeps this block independent of the module's import list.
    import functools

    @functools.wraps(handler_fun)
    def wrapped_handler(self: 'HaketiloAddon', flow: http.HTTPFlow):
        """Run the wrapped handler and apply the state update it requests."""
        with self.configured_lock:
            assert self.configured

        assert self.state is not None

        state_updater = handler_fun(self, flow)

        if state_updater is not None:
            state_updater(self.state)

    return wrapped_handler

@dc.dataclass
class HaketiloAddon:
    """
    Mitmproxy addon object.

    It owns the HaketiloState instance (created in `configure()`) and a
    registry that maps every flow currently being processed to the FlowHandler
    chosen for it in `requestheaders()`.
    """
    # Set once `configure()` has created `state`; guarded by `configured_lock`.
    configured:      bool = False
    configured_lock: Lock = dc.field(default_factory=Lock)

    state: t.Optional[HaketiloState] = None

    # Handlers of flows in progress; guarded by `handlers_lock`.
    flow_handlers: FlowHandlers = dc.field(default_factory=dict)
    handlers_lock: Lock         = dc.field(default_factory=Lock)

    def load(self, loader: addonmanager.Loader) -> None:
        """Register the 'haketilo_dir' option with mitmproxy."""
        loader.add_option(
            name     = 'haketilo_dir',
            typespec = str,
            default  = '~/.haketilo/',
            help     = "Point to a Haketilo data directory to use",
        )

    def configure(self, updated: set[str]) -> None:
        """
        Create the addon's state when the 'haketilo_dir' option gets set.

        Only the first configuration takes effect; later changes of the option
        are reported with a warning and otherwise ignored.
        """
        if 'haketilo_dir' not in updated:
            return

        with self.configured_lock:
            if self.configured:
                ctx.log.warn(_('haketilo_dir_already_configured'))
                return

            # The default value starts with '~', which Path does not expand
            # on its own.
            haketilo_dir = Path(ctx.options.haketilo_dir).expanduser()
            self.state = HaketiloState(haketilo_dir / 'store')

            # Without this the event handlers' `assert self.configured`
            # could never pass.
            self.configured = True

    def assign_handler(self, flow: http.HTTPFlow, flow_handler: FlowHandler) \
        -> None:
        """Remember `flow_handler` as the handler of `flow`."""
        with self.handlers_lock:
            self.flow_handlers[id(flow)] = flow_handler

    def lookup_handler(self, flow: http.HTTPFlow) -> FlowHandler:
        """
        Return the handler previously assigned to `flow`.

        Raises KeyError if no handler is registered for this flow.
        """
        with self.handlers_lock:
            return self.flow_handlers[id(flow)]

    def forget_handler(self, flow: http.HTTPFlow) -> None:
        """Drop the handler of `flow`; do nothing if there is none."""
        with self.handlers_lock:
            self.flow_handlers.pop(id(flow), None)

    @concurrent
    @http_event_handler
    def requestheaders(self, flow: http.HTTPFlow) -> t.Optional[StateUpdater]:
        """
        Select a policy for the requested URL, create and register the
        matching flow handler and let it process the request headers.
        """
        assert self.state is not None

        policy = self.state.select_policy(flow.request.url)

        flow_handler = make_flow_handler(flow, policy)

        self.assign_handler(flow, flow_handler)

        return flow_handler.on_requestheaders()

    @concurrent
    @http_event_handler
    def request(self, flow: http.HTTPFlow) -> t.Optional[StateUpdater]:
        """Let the flow's handler process the complete request."""
        return self.lookup_handler(flow).on_request()

    @concurrent
    @http_event_handler
    def responseheaders(self, flow: http.HTTPFlow) -> t.Optional[StateUpdater]:
        """Let the flow's handler process the response headers."""
        return self.lookup_handler(flow).on_responseheaders()

    @concurrent
    @http_event_handler
    def response(self, flow: http.HTTPFlow) -> t.Optional[StateUpdater]:
        """
        Let the flow's handler process the complete response, then forget the
        handler.
        """
        try:
            return self.lookup_handler(flow).on_response()
        finally:
            # This is the last event of the flow; release the handler even if
            # processing failed so that the registry does not grow forever.
            self.forget_handler(flow)

    @http_event_handler
    def error(self, flow: http.HTTPFlow) -> None:
        """Forget the handler of a flow that ended with an error."""
        self.forget_handler(flow)

addons = [
    HaketiloAddon()
]
# Callback a handler may return to request a modification of the proxy state.
# HaketiloState is spelled as a forward reference; it is needed for typing
# purposes only.
StateUpdater = t.Callable[['HaketiloState'], None]

@dc.dataclass(frozen=True)
class FlowHandler:
    """
    Base flow handler: couples an HTTP flow with the policy selected for it.

    The `on_*()` methods are meant to be called for the respective mitmproxy
    events. Each of them returns either None or a state updater.
    """
    flow:   http.HTTPFlow
    policy: policies.Policy

    # Whether request/response bodies should be streamed instead of buffered.
    stream_request:  bool = False
    stream_response: bool = False

    def on_requestheaders(self) -> t.Optional[StateUpdater]:
        """Enable request streaming if this handler asks for it."""
        if self.stream_request:
            self.flow.request.stream = True

        return None

    def on_request(self) -> t.Optional[StateUpdater]:
        """Do nothing; subclasses may override."""
        return None

    def on_responseheaders(self) -> t.Optional[StateUpdater]:
        """Enable response streaming if this handler asks for it."""
        assert self.flow.response is not None

        if self.stream_response:
            self.flow.response.stream = True

        return None

    def on_response(self) -> t.Optional[StateUpdater]:
        """Do nothing; subclasses may override."""
        return None

@dc.dataclass(frozen=True)
class FlowHandlerAllowScripts(FlowHandler):
    """Handler that leaves the flow unmodified and streams both bodies."""
    policy: policies.AllowPolicy

    stream_request:  bool = True
    stream_response: bool = True

# NOTE(review): 'x-content-security-policy' and 'x-webkit-csp' are each listed
# with both dispositions, so code iterating over this table sees these header
# names twice -- confirm this is intended.
csp_header_names_and_dispositions = (
    ('content-security-policy',             'enforce'),
    ('content-security-policy-report-only', 'report'),
    ('x-content-security-policy',           'enforce'),
    ('x-content-security-policy',           'report'),
    ('x-webkit-csp',                        'enforce'),
    ('x-webkit-csp',                        'report')
)

csp_enforce_header_names_set = {
    name for name, disposition in csp_header_names_and_dispositions
    if disposition == 'enforce'
}

@dc.dataclass
class ContentSecurityPolicy:
    """
    A single parsed Content Security Policy.

    `directives` maps lowercased directive names to lists of their value
    tokens. `header_name` and `disposition` tell where the policy came from.
    """
    directives:  dict[str, list[str]]
    header_name: str
    disposition: str

    @staticmethod
    def deserialize(
            serialized:  str,
            header_name: str,
            disposition: str = 'enforce'
    ) -> 'ContentSecurityPolicy':
        """
        Parse one serialized policy (a ';'-separated list of directives).

        Non-ASCII and empty directives are skipped. When a directive name
        repeats, only its first occurrence is kept.
        """
        # For more info, see:
        # https://www.w3.org/TR/CSP3/#parse-serialized-policy
        directives: dict[str, list[str]] = {}

        for serialized_directive in serialized.split(';'):
            if not serialized_directive.isascii():
                continue

            tokens = serialized_directive.split()
            if not tokens:
                continue

            directive_name, *directive_value = tokens

            # Specs mention giving warnings for duplicate directive names but
            # from our proxy's perspective this is not important right now.
            directives.setdefault(directive_name.lower(), directive_value)

        return ContentSecurityPolicy(directives, header_name, disposition)

    def serialize(self) -> str:
        """Return the policy as a string suitable for use as a header value."""
        # Joining the name together with the values avoids a trailing space
        # after directives that have no value.
        return ';'.join(
            ' '.join((name, *value_list))
            for name, value_list in self.directives.items()
        )

def extract_csp(headers: Headers) -> tuple[ContentSecurityPolicy, ...]:
    """
    Parse all policies found in the CSP headers among `headers`.

    Every header listed in `csp_header_names_and_dispositions` is looked at.
    A header value may hold several comma-separated policies. Policies with
    no directives are omitted from the result.
    """
    csp_policies = []

    for header_name, disposition in csp_header_names_and_dispositions:
        # get_all() yields each occurrence of the header as a separate string;
        # plain get() would return one string and looping over it would visit
        # single characters.
        for header_value in headers.get_all(header_name):
            for serialized in header_value.split(','):
                policy = ContentSecurityPolicy.deserialize(
                    serialized,
                    header_name,
                    disposition
                )

                if policy.directives:
                    csp_policies.append(policy)

    return tuple(csp_policies)

csp_script_directive_names = (
    'script-src',
    'script-src-elem',
    'script-src-attr'
)

@dc.dataclass(frozen=True)
class FlowHandlerBlockScripts(FlowHandler):
    """Handler that blocks script execution through an additional CSP."""
    policy: policies.BlockPolicy

    stream_request:  bool = True
    stream_response: bool = True

    def on_responseheaders(self) -> t.Optional[StateUpdater]:
        """
        Rewrite the CSP headers of the response.

        Enforced policies sent by the server are kept, minus their reporting
        directives. Report-only policies are dropped. A policy that forbids
        all scripts is added on top.
        """
        super().on_responseheaders()

        assert self.flow.response is not None

        headers = self.flow.response.headers

        csp_policies = extract_csp(headers)

        for header_name, _disposition in csp_header_names_and_dispositions:
            # Most responses lack most of these headers and some names repeat
            # in the table, so a missing header must not be an error.
            headers.pop(header_name, None)

        for policy in csp_policies:
            if policy.disposition != 'enforce':
                continue

            # Reporting directives are optional; tolerate their absence.
            policy.directives.pop('report-to',  None)
            policy.directives.pop('report-uri', None)

            if not policy.directives:
                continue

            headers.add(policy.header_name, policy.serialize())

        extra_csp = ';'.join(
            f"{name} 'none'" for name in csp_script_directive_names
        )

        headers.add('Content-Security-Policy', extra_csp)

        return None

# For details of 'Content-Type' header's structure, see:
# https://datatracker.ietf.org/doc/html/rfc7231#section-3.1.1.1
content_type_reg = re.compile(r'''
^
\s*
(?P<mime>[\w.+-]+/[\w.+-]+)     # media type, e.g. application/xhtml+xml
(?:                             # parameters preceding "charset", if any
    \s*;\s*
    (?!charset=)
    [^;]*
)*?
(?:                             # the "charset" parameter is optional
    \s*;\s*
    charset=                    # no whitespace allowed in parameter as per RFC
    "?(?P<encoding>[\w-]+)"?    # quotes are optional per RFC
    \s*
    (?:;.*)?                    # parameters following "charset", if any
)?
\s*
$
''', re.VERBOSE | re.IGNORECASE)

def deduce_content_type(headers: Headers) \
    -> tuple[t.Optional[str], t.Optional[str]]:
    """
    Return a `(mime, encoding)` pair read from the 'Content-Type' header.

    Both items are lowercased. `encoding` is None when the header carries no
    "charset" parameter. `(None, None)` is returned when the header is missing
    or cannot be parsed.
    """
    content_type = headers.get('content-type')
    if content_type is None:
        return (None, None)

    match = content_type_reg.match(content_type)
    if match is None:
        return (None, None)

    mime, encoding = match.group('mime'), match.group('encoding')

    if encoding is not None:
        encoding = encoding.lower()

    return mime.lower(), encoding

UTF8_BOM = b'\xEF\xBB\xBF'
BOMs = (
    (UTF8_BOM,    'utf-8'),
    (b'\xFE\xFF', 'utf-16be'),
    (b'\xFF\xFE', 'utf-16le')
)

def block_attr(element: bs4.PageElement, atrr_name: str) -> None:
    """Placeholder; currently does nothing."""
    # TODO: implement
    pass

@dc.dataclass(frozen=True)
class FlowHandlerInjectPayload(FlowHandler):
    """
    Handler that injects the policy's scripts into HTML responses and
    replaces the response's CSP with one that permits those scripts.
    """
    policy: policies.PayloadPolicy

    stream_request: bool = True

    def __post_init__(self) -> None:
        """Precompute the CSP header value to send with responses."""
        assets_base_url = self.policy.assets_base_url()

        script_src = f"script-src {assets_base_url}"
        if self.policy.is_eval_allowed():
            script_src = f"{script_src} 'unsafe-eval'"

        new_csp = '; '.join((
            script_src,
            # 'script-src-elem' governs <script> elements, which is how the
            # payload gets injected; a value of 'none' here would block the
            # injected scripts themselves.
            f"script-src-elem {assets_base_url}",
            "script-src-attr 'none'"
        ))

        # The dataclass is frozen, so plain attribute assignment would raise
        # FrozenInstanceError.
        object.__setattr__(self, 'new_csp', new_csp)

    def on_responseheaders(self) -> t.Optional[StateUpdater]:
        """Replace all CSP headers of the response with the precomputed one."""
        super().on_responseheaders()

        assert self.flow.response is not None

        headers = self.flow.response.headers

        for header_name, _disposition in csp_header_names_and_dispositions:
            # A missing header must not be an error.
            headers.pop(header_name, None)

        headers.add('Content-Security-Policy', self.new_csp)

        return None

    def on_response(self) -> t.Optional[StateUpdater]:
        """
        Inject the policy's scripts into the response body if it is HTML.

        Responses without content or with a non-HTML (or unrecognized) content
        type are left untouched. A modified body is always re-encoded as
        UTF-8.
        """
        super().on_response()

        assert self.flow.response is not None

        content = self.flow.response.content
        if content is None:
            return None

        mime, encoding = deduce_content_type(self.flow.response.headers)
        if mime is None or 'html' not in mime:
            return None

        # A UTF BOM overrides encoding specified by the header.
        for bom, encoding_name in BOMs:
            if content.startswith(bom):
                encoding = encoding_name
                break

        soup = bs4.BeautifulSoup(
            markup        = content,
            from_encoding = encoding,
            features      = 'html5lib'
        )

        # Inject scripts.
        script_parent = soup.find('body') or soup.find('html')
        if script_parent is None:
            return None

        for url in self.policy.script_urls():
            script_parent.append(bs4.Tag(name='script', attrs={'src': url}))

        # Remove Content Security Policy that could possibly block injected
        # scripts.
        for meta in soup.select('head meta[http-equiv]'):
            header_name = meta.attrs.get('http-equiv', '').lower().strip()
            if header_name in csp_enforce_header_names_set:
                block_attr(meta, 'http-equiv')
                block_attr(meta, 'content')

        # Prepending a three-byte Byte Order Mark (BOM) will force the browser
        # to decode this as UTF-8 regardless of the 'Content-Type' header. See:
        # https://www.w3.org/International/tests/repository/html5/the-input-byte-stream/results-basics#precedence
        self.flow.response.content = UTF8_BOM + soup.encode()

        return None

@dc.dataclass(frozen=True)
class FlowHandlerMetaResource(FlowHandler):
    """Handler for meta resource policies; not implemented yet."""
    policy: policies.MetaResourcePolicy

    def on_request(self) -> t.Optional[StateUpdater]:
        """Placeholder; currently leaves the flow untouched."""
        super().on_request()
        # TODO: implement
        #self.flow.response = ....

        return None

def make_flow_handler(flow: http.HTTPFlow, policy: policies.Policy) \
    -> FlowHandler:
    """Create the flow handler that corresponds to the type of `policy`."""
    if isinstance(policy, policies.BlockPolicy):
        return FlowHandlerBlockScripts(flow, policy)

    if isinstance(policy, policies.AllowPolicy):
        return FlowHandlerAllowScripts(flow, policy)

    if isinstance(policy, policies.PayloadPolicy):
        return FlowHandlerInjectPayload(flow, policy)

    assert isinstance(policy, policies.MetaResourcePolicy)
    # def response_creator(request: http.HTTPRequest) -> http.HTTPResponse:
    #     """...."""
    #     replacement_details = make_replacement_resource(
    #         policy.replacement,
    #         request.path
    #     )

    #     return http.HTTPResponse.make(
    #         replacement_details.status_code,
    #         replacement_details.content,
    #         replacement_details.content_type
    #     )
    return FlowHandlerMetaResource(flow, policy)
class Policy(ABC):
    """Common base class of all policies."""

class PayloadPolicy(Policy):
    """Policy carrying information about scripts to use on a page."""

    def assets_base_url(self) -> str:
        """Return the URL prefix of the payload's assets (fixed for now)."""
        return 'https://example.com/static/'

    def script_urls(self) -> t.Sequence[str]:
        """Return the URLs of the payload's scripts (fixed for now)."""
        # TODO: implement
        return ('https://example.com/static/somescript.js',)

    def is_eval_allowed(self) -> bool:
        """Tell whether the payload may use eval() (always true for now)."""
        # TODO: implement
        return True

class MetaResourcePolicy(Policy):
    """Marker class of meta resource policies."""

class AllowPolicy(Policy):
    """Base class of the "allow" policies."""

@dc.dataclass
class RuleAllowPolicy(AllowPolicy):
    """An "allow" policy that remembers the pattern it is bound to."""
    pattern: str

class FallbackAllowPolicy(AllowPolicy):
    """An "allow" policy variant that carries no data."""

class BlockPolicy(Policy):
    """Base class of the "block" policies."""

@dc.dataclass
class RuleBlockPolicy(BlockPolicy):
    """A "block" policy that remembers the pattern it is bound to."""
    pattern: str

class FallbackBlockPolicy(BlockPolicy):
    """A "block" policy variant that carries no data."""

@dc.dataclass
class ErrorBlockPolicy(BlockPolicy):
    """A "block" policy that remembers an exception."""
    error: Exception
+# +# Copyright (C) 2022 Wojtek Kosior +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# +# I, Wojtek Kosior, thereby promise not to sue for violation of this +# file's license. Although I request that you do not make use this code +# in a proprietary program, I am not going to enforce this in court. + +""" +This module contains logic for keeping track of all settings, rules, mappings +and resources. +""" + +# Enable using with Python 3.7. +from __future__ import annotations + +import typing as t +import dataclasses as dc + +from threading import Lock +from pathlib import Path + +from ..pattern_tree import PatternTree +from .store import HaketiloStore +from . 
def make_pattern_tree_with_builtin_policies() -> PatternTree[policies.Policy]:
    """Create the initial pattern tree; for now it is simply an empty tree."""
    # TODO: implement
    return PatternTree()

tree_field = dc.field(default_factory=make_pattern_tree_with_builtin_policies)

@dc.dataclass
class HaketiloState(HaketiloStore):
    """
    Store extended with the in-memory data used for selecting policies.
    """
    pattern_tree:  PatternTree[policies.Policy] = tree_field
    # Whether URLs matched by no policy get the "allow" fallback policy
    # rather than the "block" one.
    default_allow: bool = False

    # Guards reads of the attributes above.
    state_lock: Lock = dc.field(default_factory=Lock)

    def select_policy(self, url: str, allow_disabled=False) -> policies.Policy:
        """
        Return the policy to apply to `url`.

        Matching against the pattern tree is not implemented yet, so the
        result is currently always a fallback policy: FallbackAllowPolicy if
        `default_allow` is set and FallbackBlockPolicy otherwise. If the
        lookup raises, the exception is returned wrapped in an
        ErrorBlockPolicy instead of being propagated. `allow_disabled` is not
        used yet.
        """
        with self.state_lock:
            # Take a consistent snapshot; the lookup itself runs unlocked.
            pattern_tree  = self.pattern_tree
            default_allow = self.default_allow

        try:
            for policy_set in pattern_tree.search(url):
                # TODO: implement
                # if policy.enabled or allow_disabled:
                #     return policy
                pass

            if default_allow:
                return policies.FallbackAllowPolicy()

            return policies.FallbackBlockPolicy()
        except Exception as e:
            return policies.ErrorBlockPolicy(e)
@dc.dataclass
class HaketiloStore:
    """
    Handle to Haketilo proxy data kept on disk.

    For now it only remembers the directory it was given.
    """
    # Directory passed in by the creator of the store.
    store_dir: Path
    # TODO: implement