path: root/src/hydrilla/proxy/policies/payload.py
author     Wojtek Kosior <koszko@koszko.org>  2022-07-27 15:56:24 +0200
committer  Wojtek Kosior <koszko@koszko.org>  2022-08-10 17:25:05 +0200
commit     879c41927171efc8d77d1de2739b18e2eb57580f (patch)
tree       de0e78afe2ea49e58c9bf2c662657392a00139ee /src/hydrilla/proxy/policies/payload.py
parent     52d12a4fa124daa1595529e3e7008276a7986d95 (diff)
download   haketilo-hydrilla-879c41927171efc8d77d1de2739b18e2eb57580f.tar.gz
           haketilo-hydrilla-879c41927171efc8d77d1de2739b18e2eb57580f.zip
unfinished partial work
Diffstat (limited to 'src/hydrilla/proxy/policies/payload.py')
-rw-r--r--  src/hydrilla/proxy/policies/payload.py  315
1 file changed, 315 insertions(+), 0 deletions(-)
diff --git a/src/hydrilla/proxy/policies/payload.py b/src/hydrilla/proxy/policies/payload.py
new file mode 100644
index 0000000..d616f1b
--- /dev/null
+++ b/src/hydrilla/proxy/policies/payload.py
@@ -0,0 +1,315 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# Policies for applying payload injections to HTTP requests.
+#
+# This file is part of Hydrilla&Haketilo.
+#
+# Copyright (C) 2022 Wojtek Kosior
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#
+# I, Wojtek Kosior, thereby promise not to sue for violation of this
+# file's license. Although I request that you do not make use of this code
+# in a proprietary program, I am not going to enforce this in court.
+
+"""
+.....
+"""
+
+# Enable using with Python 3.7.
+from __future__ import annotations
+
+import dataclasses as dc
+import typing as t
+import re
+
+import bs4 # type: ignore
+
+from ...url_patterns import ParsedUrl
+from .. import state
+from .. import csp
+from . import base
+
+@dc.dataclass(frozen=True) # type: ignore[misc]
+class PayloadAwarePolicy(base.Policy):
+    """Base class for policies that act with respect to a single payload."""
+    haketilo_state: state.HaketiloState
+    payload_data: state.PayloadData
+
+    def assets_base_url(self, request_url: ParsedUrl):
+        """Compute the base URL under which this payload's assets are served."""
+        token = self.payload_data.unique_token
+
+        base_path_segments = (*self.payload_data.pattern.path_segments, token)
+
+        return f'{request_url.url_without_path}/{"/".join(base_path_segments)}/'
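+
+    # Illustrative sketch (not part of the original commit): assuming
+    # request_url.url_without_path is 'https://example.com', the pattern's
+    # path_segments are ('articles',) and unique_token is 'abc123', the
+    # method above returns 'https://example.com/articles/abc123/'.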
+
+
+@dc.dataclass(frozen=True) # type: ignore[misc]
+class PayloadAwarePolicyFactory(base.PolicyFactory):
+    """Base class for factories that produce payload-aware policies."""
+    payload_key: state.PayloadKey
+
+    @property
+    def payload_ref(self) -> state.PayloadRef:
+        """Reference to the payload this factory's policies will serve."""
+        return self.payload_key.payload_ref
+
+    def __lt__(self, other: base.PolicyFactory) -> bool:
+        """Order factories of the same type by their payload keys."""
+        if isinstance(other, type(self)):
+            return self.payload_key < other.payload_key
+
+        return super().__lt__(other)
+
+
+# For details of 'Content-Type' header's structure, see:
+# https://datatracker.ietf.org/doc/html/rfc7231#section-3.1.1.1
+content_type_reg = re.compile(r'''
+^
+(?P<mime>[\w-]+/[\w-]+)
+\s*
+(?:
+    ;
+    (?:[^;]*;)*    # match possible parameters other than "charset"
+)
+\s*
+charset=           # no whitespace allowed around '=' as per the RFC
+(?P<encoding>
+    [\w-]+
+    |
+    "[\w-]+"       # quotes around the charset value are optional per the RFC
+)
+(?:;[^;]+)*        # match possible parameters other than "charset"
+$                  # forbid dangling characters after a closing '"'
+''', re.VERBOSE | re.IGNORECASE)
+
+def deduce_content_type(headers: base.IHeaders) \
+        -> tuple[t.Optional[str], t.Optional[str]]:
+    """Extract MIME type and character encoding from a 'Content-Type' header."""
+    content_type = headers.get('content-type')
+    if content_type is None:
+        return (None, None)
+
+    match = content_type_reg.match(content_type)
+    if match is None:
+        return (None, None)
+
+    mime, encoding = match.group('mime'), match.group('encoding')
+
+    if encoding is not None:
+        # Strip the optional quotes permitted by the RFC's quoted-string form.
+        encoding = encoding.strip('"').lower()
+
+    return mime, encoding
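+
+# Illustrative sketch (not part of the original commit); a plain dict stands in
+# for base.IHeaders here since only .get() is used:
+#
+#     deduce_content_type({'content-type': 'text/html; charset=UTF-8'})
+#     # -> ('text/html', 'utf-8')
+#     deduce_content_type({'content-type': 'application/json'})
+#     # -> (None, None)   (no charset parameter, so the regex does not match)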
+
+UTF8_BOM = b'\xEF\xBB\xBF'
+BOMs = (
+    (UTF8_BOM, 'utf-8'),
+    (b'\xFE\xFF', 'utf-16be'),
+    (b'\xFF\xFE', 'utf-16le')
+)
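+
+# Illustrative sketch (not part of the original commit): a response body
+# starting with b'\xFF\xFE' would be treated as UTF-16LE by the BOM-sniffing
+# loop in _consume_response_unsafe() below, overriding any charset declared
+# in the 'Content-Type' header.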
+
+def block_attr(element: bs4.PageElement, attr_name: str) -> None:
+    """
+    Disable an HTML node attribute by prepending 'blocked-' to its name. This
+    allows it to still be relatively easily accessed in case it contains some
+    useful data.
+    """
+    blocked_value = element.attrs.pop(attr_name, None)
+
+    while blocked_value is not None:
+        attr_name = f'blocked-{attr_name}'
+        next_blocked_value = element.attrs.pop(attr_name, None)
+        element.attrs[attr_name] = blocked_value
+
+        blocked_value = next_blocked_value
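+
+# Illustrative sketch (not part of the original commit): given a tag parsed
+# from '<meta http-equiv="content-security-policy" content="...">', calling
+# block_attr(tag, 'http-equiv') leaves the tag with a
+# 'blocked-http-equiv="content-security-policy"' attribute instead; an
+# already present 'blocked-http-equiv' attribute would in turn be pushed to
+# 'blocked-blocked-http-equiv', so no previous value is lost.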
+
+@dc.dataclass(frozen=True)
+class PayloadInjectPolicy(PayloadAwarePolicy):
+    """Policy that injects the payload's scripts into matching HTML responses."""
+    process_response: t.ClassVar[bool] = True
+
+    priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._TWO
+
+    def _new_csp(self, request_url: ParsedUrl) -> str:
+        """Construct the Content-Security-Policy value to send with the response."""
+        assets_base = self.assets_base_url(request_url)
+
+        script_src = f"script-src {assets_base}"
+
+        if self.payload_data.eval_allowed:
+            script_src = f"{script_src} 'unsafe-eval'"
+
+        return '; '.join((
+            script_src,
+            "script-src-elem 'none'",
+            "script-src-attr 'none'"
+        ))
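+
+    # Illustrative sketch (not part of the original commit): with an assets
+    # base URL of https://example.com/abc123/ and eval_allowed set, the
+    # returned header value would be:
+    #     script-src https://example.com/abc123/ 'unsafe-eval';
+    #     script-src-elem 'none'; script-src-attr 'none'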
+
+    def _modify_headers(self, response_info: base.ResponseInfo) \
+            -> t.Iterable[tuple[bytes, bytes]]:
+        """Yield response headers with CSP-related ones replaced by our own policy."""
+        for header_name, header_value in response_info.headers.items():
+            if header_name.lower() not in csp.header_names_and_dispositions:
+                yield header_name.encode(), header_value.encode()
+
+        new_csp = self._new_csp(response_info.url)
+
+        yield b'Content-Security-Policy', new_csp.encode()
+
+    def _script_urls(self, url: ParsedUrl) -> t.Iterable[str]:
+        """Generate the URLs under which the payload's scripts are served."""
+        base_url = self.assets_base_url(url)
+        payload_ref = self.payload_data.payload_ref
+
+        for path in payload_ref.get_script_paths(self.haketilo_state):
+            yield base_url + '/'.join(('static', *path))
+
+    def _modify_body(
+            self,
+            url: ParsedUrl,
+            body: bytes,
+            encoding: t.Optional[str]
+    ) -> bytes:
+        """Inject the payload's script tags into an HTML document body."""
+        soup = bs4.BeautifulSoup(
+            markup = body,
+            from_encoding = encoding,
+            features = 'html5lib'
+        )
+
+        # Inject scripts.
+        script_parent = soup.find('body') or soup.find('html')
+        if script_parent is None:
+            return body
+
+        for script_url in self._script_urls(url):
+            tag = bs4.Tag(name='script', attrs={'src': script_url})
+            script_parent.append(tag)
+
+        # Disable any Content Security Policy <meta> tags that could possibly
+        # block the injected scripts.
+        for meta in soup.select('head meta[http-equiv]'):
+            header_name = meta.attrs.get('http-equiv', '').lower().strip()
+            if header_name in csp.enforce_header_names_set:
+                block_attr(meta, 'http-equiv')
+                block_attr(meta, 'content')
+
+        # Prepending a three-byte Byte Order Mark (BOM) will force the browser
+        # to decode this as UTF-8 regardless of the 'Content-Type' header. See:
+        # https://www.w3.org/International/tests/repository/html5/the-input-byte-stream/results-basics#precedence
+        return UTF8_BOM + soup.encode()
+
+    def _consume_response_unsafe(self, response_info: base.ResponseInfo) \
+            -> base.ProducedResponse:
+        """Rewrite the response's headers and HTML body; may raise exceptions."""
+        new_response = response_info.make_produced_response()
+
+        new_headers = self._modify_headers(response_info)
+
+        new_response = dc.replace(new_response, headers=new_headers)
+
+        mime, encoding = deduce_content_type(response_info.headers)
+        if mime is None or 'html' not in mime.lower():
+            return new_response
+
+        data = response_info.body
+        if data is None:
+            data = b''
+
+        # A UTF BOM overrides the encoding specified by the header.
+        for bom, encoding_name in BOMs:
+            if data.startswith(bom):
+                encoding = encoding_name
+
+        new_data = self._modify_body(response_info.url, data, encoding)
+
+        return dc.replace(new_response, body=new_data)
+
+    def consume_response(self, response_info: base.ResponseInfo) \
+            -> base.ProducedResponse:
+        """Modify the response, falling back to an error page on failure."""
+        try:
+            return self._consume_response_unsafe(response_info)
+        except Exception as e:
+            # TODO: actually describe the errors
+            import traceback
+
+            error_info_list = traceback.format_exception(
+                type(e),
+                e,
+                e.__traceback__
+            )
+
+            return base.ProducedResponse(
+                500,
+                ((b'Content-Type', b'text/plain; charset=utf-8'),),
+                '\n'.join(error_info_list).encode()
+            )
+
+
+class AutoPayloadInjectPolicy(PayloadInjectPolicy):
+    """Policy that also enables the payload's mapping before injecting scripts."""
+    priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._ONE
+
+    def _modify_body(
+            self,
+            url: ParsedUrl,
+            body: bytes,
+            encoding: t.Optional[str]
+    ) -> bytes:
+        """Mark the payload's mapping as enabled, then inject as usual."""
+        payload_ref = self.payload_data.payload_ref
+        mapping_ref = payload_ref.get_mapping(self.haketilo_state)
+        mapping_ref.enable(self.haketilo_state)
+
+        return super()._modify_body(url, body, encoding)
+
+
+@dc.dataclass(frozen=True)
+class PayloadSuggestPolicy(PayloadAwarePolicy):
+    """Policy that shall ask the user whether to use the payload on a page."""
+    priority: t.ClassVar[base.PolicyPriority] = base.PolicyPriority._ONE
+
+    def make_response(self, request_info: base.RequestInfo) \
+            -> base.ProducedResponse:
+        """Generate a response presenting the suggestion to the user."""
+        # TODO: implement
+        return base.ProducedResponse(200, ((b'a', b'b'),), b'')
+
+
+@dc.dataclass(frozen=True, unsafe_hash=True) # type: ignore[misc]
+class PayloadPolicyFactory(PayloadAwarePolicyFactory):
+    """Factory producing the policy appropriate to the payload's current state."""
+    def make_policy(self, haketilo_state: state.HaketiloState) \
+            -> t.Optional[base.Policy]:
+        """Choose a policy type based on payload data and the mapping use mode."""
+        try:
+            payload_data = self.payload_ref.get_data(haketilo_state)
+        except Exception:
+            return None
+
+        if payload_data.explicitly_enabled:
+            return PayloadInjectPolicy(haketilo_state, payload_data)
+
+        mode = haketilo_state.get_settings().mapping_use_mode
+
+        if mode == state.MappingUseMode.QUESTION:
+            return PayloadSuggestPolicy(haketilo_state, payload_data)
+
+        if mode == state.MappingUseMode.WHEN_ENABLED:
+            return None
+
+        # mode == state.MappingUseMode.AUTO
+        return AutoPayloadInjectPolicy(haketilo_state, payload_data)