From 52d12a4fa124daa1595529e3e7008276a7986d95 Mon Sep 17 00:00:00 2001
From: Wojtek Kosior
Date: Mon, 13 Jun 2022 11:06:49 +0200
Subject: unfinished partial work

---
 src/hydrilla/url_patterns.py | 181 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 src/hydrilla/url_patterns.py

(limited to 'src/hydrilla/url_patterns.py')

diff --git a/src/hydrilla/url_patterns.py b/src/hydrilla/url_patterns.py
new file mode 100644
index 0000000..8e80379
--- /dev/null
+++ b/src/hydrilla/url_patterns.py
@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# Data structure for querying URL patterns.
+#
+# This file is part of Hydrilla&Haketilo.
+#
+# Copyright (C) 2021, 2022 Wojtek Kosior
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+#
+# I, Wojtek Kosior, thereby promise not to sue for violation of this
+# file's license.  Although I request that you do not make use of this
+# code in a proprietary program, I am not going to enforce this in court.
+
+"""
+This module contains functions for deconstruction and construction of URLs and
+Haketilo URL patterns.
+
+Data structures for querying data using URL patterns are also defined here.
+"""
+
+# Enable using with Python 3.7.
from __future__ import annotations

import re
import urllib.parse as up
import typing as t
import dataclasses as dc

from immutables import Map

from hydrilla.translations import smart_gettext as _
from hydrilla.exceptions import HaketiloException

# Default TCP ports of the supported schemes.  'file' is absent on purpose —
# file:// URLs carry no port at all.
default_ports: t.Mapping[str, int] = Map(http=80, https=443, ftp=21)

@dc.dataclass(frozen=True, unsafe_hash=True)
class ParsedUrl:
    """
    Deconstructed URL or URL pattern.

    Instances are immutable.  Equality and hashing are based solely on
    orig_url (every other field is excluded from the hash); the remaining
    fields hold the URL broken into query-friendly pieces.
    """
    # The URL/pattern text this instance was parsed from; the only field
    # that participates in __hash__().
    orig_url:           str
    # URL scheme, e.g. 'https' (special pattern schemes like 'http*' are
    # resolved to concrete ones before construction).
    scheme:             str = dc.field(hash=False)
    # Hostname labels in reverse order, e.g. ('com', 'example', 'www').
    domain_labels:      tuple[str, ...] = dc.field(hash=False)
    # Non-empty path segments, e.g. ('a', 'b') for path '/a//b/'.
    path_segments:      tuple[str, ...] = dc.field(hash=False)
    # Whether the original path ended with '/'.
    has_trailing_slash: bool = dc.field(hash=False)
    # Explicit port or the scheme's default one.
    port:               int = dc.field(hash=False)

    # NOTE: unfinished draft kept from work in progress — not yet part of the
    # public interface.
    # def reconstruct_url(self) -> str:
    #     """...."""
    #     scheme = self.orig_scheme
    #
    #     netloc = '.'.join(reversed(self.domain_labels))
    #     if scheme == self.scheme and \
    #        self.port is not None and \
    #        default_ports[scheme] != self.port:
    #         netloc += f':{self.port}'
    #
    #     path = '/'.join(('', *self.path_segments))
    #     if self.has_trailing_slash:
    #         path += '/'
    #
    #     return f'{scheme}://{netloc}{path}'

# URLs with those schemes will be recognized but not all of them have to be
# actually supported by Hydrilla server and Haketilo proxy.
supported_schemes = 'http', 'https', 'ftp', 'file'

def _parse_pattern_or_url(url: str, orig_url: str, is_pattern: bool = False) \
    -> ParsedUrl:
    """
    Deconstruct a URL (or a URL pattern whose special scheme has already been
    rewritten to a concrete one) into a ParsedUrl instance.

    'orig_url' is the original, unrewritten text; it is stored in the result
    (and used in error messages), while 'url' is the text actually parsed.

    Raises HaketiloException on malformed input, unsupported scheme, bad port,
    or (for patterns) a forbidden query/fragment/port component.
    """
    if not is_pattern:
        # Internal invariant: callers pass identical strings for plain URLs.
        assert orig_url == url

    parse_result = up.urlparse(url)

    # Verify the parsed URL is valid: a scheme is mandatory, file:// URLs may
    # carry neither host nor port, every other scheme requires a host.
    has_hostname = parse_result.hostname is not None
    if not parse_result.scheme or \
       (parse_result.scheme == 'file' and parse_result.port is not None) or \
       (parse_result.scheme == 'file' and has_hostname) or \
       (parse_result.scheme != 'file' and not has_hostname):
        if is_pattern:
            msg = _('err.url_pattern_{}.bad').format(orig_url)
            raise HaketiloException(msg)
        else:
            raise HaketiloException(_('err.url_{}.bad').format(url))

    # Verify the URL uses a known scheme and extract it.
    scheme = parse_result.scheme

    if parse_result.scheme not in supported_schemes:
        if is_pattern:
            msg = _('err.url_pattern_{}.bad_scheme').format(orig_url)
            raise HaketiloException(msg)
        else:
            raise HaketiloException(_('err.url_{}.bad_scheme').format(url))

    # Special pattern schemes ('http*') must not specify an explicit port.
    if is_pattern and orig_url.startswith('http*:'):
        if parse_result.port:
            fmt = _('err.url_pattern_{}.special_scheme_port')
            raise HaketiloException(fmt.format(orig_url))

    # Extract URL's explicit port or deduce the port based on URL's protocol.
    # explicit_port is initialized up front so it can never be unbound below:
    # urlparse's .port property raises ValueError for non-numeric or
    # out-of-range port components.
    explicit_port: t.Optional[int] = None
    try:
        explicit_port = parse_result.port
        port_out_of_range = explicit_port == 0
    except ValueError:
        port_out_of_range = True

    if port_out_of_range:
        if is_pattern:
            msg = _('err.url_pattern_{}.bad_port').format(orig_url)
            raise HaketiloException(msg)
        else:
            raise HaketiloException(_('err.url_{}.bad_port').format(url))

    # NOTE(review): for the 'file' scheme default_ports.get() returns None, so
    # despite the cast 'port' may be None here — confirm downstream code
    # tolerates that for file:// URLs.
    port = t.cast(int, explicit_port or default_ports.get(parse_result.scheme))

    # Make URL's hostname into a list of labels in reverse order, dropping
    # empty labels.  E.g.
    # 'https://a.bc..de.fg.com/h/i/' -> ['com', 'fg', 'de', 'bc', 'a']
    hostname = parse_result.hostname or ''
    domain_labels_with_empty = reversed(hostname.split('.'))
    domain_labels = tuple(lbl for lbl in domain_labels_with_empty if lbl)

    # Make URL's path into a list of segments, dropping empty ones.  E.g.
    # 'https://ab.cd/e//f/g/' -> ['e', 'f', 'g']
    path_segments_with_empty = parse_result.path.split('/')
    path_segments = tuple(sgmt for sgmt in path_segments_with_empty if sgmt)

    # Record whether a trailing '/' is present in the URL.
    has_trailing_slash = parse_result.path.endswith('/')

    # Perform some additional sanity checks and return the result.  Patterns
    # match on scheme/host/path only, so query strings and fragments are
    # rejected.
    if is_pattern:
        if parse_result.query:
            msg = _('err.url_pattern_{}.has_query').format(orig_url)
            raise HaketiloException(msg)

        if parse_result.fragment:
            msg = _('err.url_pattern_{}.has_frag').format(orig_url)
            raise HaketiloException(msg)

    return ParsedUrl(
        orig_url           = orig_url,
        scheme             = scheme,
        port               = port,
        domain_labels      = domain_labels,
        path_segments      = path_segments,
        has_trailing_slash = has_trailing_slash
    )

# Matches the scheme portion (everything before the first ':') of a
# URL/pattern, for substitution of special pattern schemes.
replace_scheme_regex = re.compile(r'^[^:]*')

def parse_pattern(url_pattern: str) -> t.Sequence[ParsedUrl]:
    """
    Parse a Haketilo URL pattern into a tuple of ParsedUrl instances.

    A pattern with the special 'http*' scheme expands into both its 'http'
    and 'https' variants; any other pattern yields a single ParsedUrl.
    """
    if url_pattern.startswith('http*:'):
        patterns = [
            replace_scheme_regex.sub('http', url_pattern),
            replace_scheme_regex.sub('https', url_pattern)
        ]
    else:
        patterns = [url_pattern]

    return tuple(_parse_pattern_or_url(pat, url_pattern, True)
                 for pat in patterns)

def parse_url(url: str) -> ParsedUrl:
    """Parse a regular (non-pattern) URL into a ParsedUrl instance."""
    return _parse_pattern_or_url(url, url)