aboutsummaryrefslogtreecommitdiff
path: root/src/hydrilla/url_patterns.py
diff options
context:
space:
mode:
authorWojtek Kosior <koszko@koszko.org>2022-06-13 11:06:49 +0200
committerWojtek Kosior <koszko@koszko.org>2022-07-16 16:31:44 +0200
commit52d12a4fa124daa1595529e3e7008276a7986d95 (patch)
tree9b56fe2d28ff0242f8511aca570be455112ad3df /src/hydrilla/url_patterns.py
parent9dcbfdfe8620cc417438d1727aa1e0c89846e9bf (diff)
downloadhaketilo-hydrilla-52d12a4fa124daa1595529e3e7008276a7986d95.tar.gz
haketilo-hydrilla-52d12a4fa124daa1595529e3e7008276a7986d95.zip
unfinished partial work
Diffstat (limited to 'src/hydrilla/url_patterns.py')
-rw-r--r--src/hydrilla/url_patterns.py181
1 files changed, 181 insertions, 0 deletions
diff --git a/src/hydrilla/url_patterns.py b/src/hydrilla/url_patterns.py
new file mode 100644
index 0000000..8e80379
--- /dev/null
+++ b/src/hydrilla/url_patterns.py
@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# Data structure for querying URL patterns.
+#
+# This file is part of Hydrilla&Haketilo.
+#
+# Copyright (C) 2021, 2022 Wojtek Kosior
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#
+# I, Wojtek Kosior, thereby promise not to sue for violation of this
+# file's license. Although I request that you do not make use of this
+# code in a proprietary program, I am not going to enforce this in court.
+
+"""
+This module contains functions for deconstruction and construction of URLs and
+Haketilo URL patterns.
+
+Data structures for querying data using URL patterns are also defined here.
+"""
+
+# Enable using with Python 3.7.
+from __future__ import annotations
+
+import re
+import urllib.parse as up
+import typing as t
+import dataclasses as dc
+
+from immutables import Map
+
+from hydrilla.translations import smart_gettext as _
+from hydrilla.exceptions import HaketiloException
+
# Port implied by each scheme when a URL does not specify one explicitly.
# 'file' URLs carry no port, hence no entry for them here.
default_ports: t.Mapping[str, int] = Map(http=80, https=443, ftp=21)
+
@dc.dataclass(frozen=True, unsafe_hash=True)
class ParsedUrl:
    """
    Deconstructed form of a URL or of a single Haketilo URL pattern.

    Instances are immutable.  Only orig_url participates in __hash__()
    (all other fields are declared with hash=False), so two instances
    built from the same original text hash alike even if the derived
    fields differ; equality still compares every field.
    """
    # Original URL/pattern text; the sole input to __hash__().
    orig_url: str
    # URL scheme, e.g. 'https'.
    scheme: str = dc.field(hash=False)
    # Hostname labels in reverse order, e.g. ('com', 'example') for
    # 'example.com', with empty labels dropped.
    domain_labels: tuple[str, ...] = dc.field(hash=False)
    # Non-empty path segments, e.g. ('a', 'b') for '/a//b/'.
    path_segments: tuple[str, ...] = dc.field(hash=False)
    # Whether the URL's path ended with '/'.
    has_trailing_slash: bool = dc.field(hash=False)
    # Explicit port from the URL or the scheme's default port.
    port: int = dc.field(hash=False)
+
# URLs with those schemes will be recognized but not all of them have to be
# actually supported by Hydrilla server and Haketilo proxy.
supported_schemes: tuple[str, ...] = 'http', 'https', 'ftp', 'file'
+
def _parse_pattern_or_url(url: str, orig_url: str, is_pattern: bool = False) \
    -> ParsedUrl:
    """
    Deconstruct a URL (or a URL pattern whose special scheme has already been
    substituted with a concrete one) into a ParsedUrl.

    orig_url is the text presented in error messages; when is_pattern is
    False it must be identical to url.  Raises HaketiloException on any
    malformed input.
    """
    def reject(pattern_key: str, url_key: str) -> t.NoReturn:
        # Pick the translation key and the text shown to the user depending
        # on whether a pattern or a plain URL is being processed.
        if is_pattern:
            raise HaketiloException(_(pattern_key).format(orig_url))
        raise HaketiloException(_(url_key).format(url))

    if not is_pattern:
        assert orig_url == url

    parsed = up.urlparse(url)

    # Sanity-check the parse result: a scheme is mandatory, 'file' URLs may
    # specify neither port nor hostname, every other scheme needs a hostname.
    hostless = parsed.hostname is None
    if not parsed.scheme or \
       (parsed.scheme == 'file' and parsed.port is not None) or \
       (parsed.scheme == 'file' and not hostless) or \
       (parsed.scheme != 'file' and hostless):
        reject('err.url_pattern_{}.bad', 'err.url_{}.bad')

    # Verify the URL uses a known scheme and extract it.
    scheme = parsed.scheme
    if scheme not in supported_schemes:
        reject('err.url_pattern_{}.bad_scheme', 'err.url_{}.bad_scheme')

    # The special 'http*' pattern scheme stands for both http and https and
    # therefore must not pin an explicit port.
    if is_pattern and orig_url.startswith('http*:') and parsed.port:
        fmt = _('err.url_pattern_{}.special_scheme_port')
        raise HaketiloException(fmt.format(orig_url))

    # Extract the explicit port; .port raises ValueError when the value is
    # out of range, and we additionally treat an explicit 0 as invalid.
    try:
        explicit_port = parsed.port
        port_invalid = explicit_port == 0
    except ValueError:
        port_invalid = True

    if port_invalid:
        reject('err.url_pattern_{}.bad_port', 'err.url_{}.bad_port')

    # Fall back to the scheme's default port when none was given.
    port = t.cast(int, explicit_port or default_ports.get(scheme))

    # Hostname labels in reverse order, empty labels dropped, e.g.
    # 'https://a.bc..de.fg.com/h/i/' -> ('com', 'fg', 'de', 'bc', 'a')
    hostname = parsed.hostname or ''
    domain_labels = tuple(filter(None, reversed(hostname.split('.'))))

    # Non-empty path segments, e.g.
    # 'https://ab.cd/e//f/g/' -> ('e', 'f', 'g')
    path_segments = tuple(filter(None, parsed.path.split('/')))

    # Patterns may carry neither a query string nor a fragment.
    if is_pattern:
        if parsed.query:
            msg = _('err.url_pattern_{}.has_query').format(orig_url)
            raise HaketiloException(msg)

        if parsed.fragment:
            msg = _('err.url_pattern_{}.has_frag').format(orig_url)
            raise HaketiloException(msg)

    return ParsedUrl(
        orig_url           = orig_url,
        scheme             = scheme,
        port               = port,
        domain_labels      = domain_labels,
        path_segments      = path_segments,
        has_trailing_slash = parsed.path.endswith('/')
    )
+
replace_scheme_regex = re.compile(r'^[^:]*')

def parse_pattern(url_pattern: str) -> t.Sequence[ParsedUrl]:
    """
    Parse a Haketilo URL pattern into one or more ParsedUrl instances.

    A pattern with the special 'http*' scheme expands into two entries, one
    for http and one for https; any other pattern yields a single entry.
    """
    if url_pattern.startswith('http*:'):
        expanded = [replace_scheme_regex.sub(scheme, url_pattern)
                    for scheme in ('http', 'https')]
    else:
        expanded = [url_pattern]

    return tuple(_parse_pattern_or_url(variant, url_pattern, True)
                 for variant in expanded)
+
def parse_url(url: str) -> ParsedUrl:
    """
    Parse a plain (non-pattern) URL into a ParsedUrl.

    Raises HaketiloException when the URL is malformed, uses an unknown
    scheme or carries an invalid port.
    """
    return _parse_pattern_or_url(url, url)