path: root/src/hydrilla/server/serve.py
author    Wojtek Kosior <koszko@koszko.org>    2022-02-08 15:29:49 +0100
committer Wojtek Kosior <koszko@koszko.org>    2022-02-09 15:34:46 +0100
commit    6676b4ed90e19e2fd6ee5f4242cf85f64db145d8 (patch)
tree      42b45c6ed731abeab85e160b020bc57cab638fff /src/hydrilla/server/serve.py
parent    67631e6c5db6739f7a57958d222e5af7ebc364b0 (diff)
download  haketilo-hydrilla-6676b4ed90e19e2fd6ee5f4242cf85f64db145d8.tar.gz
          haketilo-hydrilla-6676b4ed90e19e2fd6ee5f4242cf85f64db145d8.zip
rework Hydrilla to use a separate tool for building its source packages
* Hydrilla now depends on "Hydrilla builder" developed at: https://git.koszko.org/hydrilla-builder/
* Hydrilla repository is now REUSE-compliant
* The debian packaging is temporarily not tested and likely to be broken
* JSON schemas are now in use (through the 'jsonschema' Python library)
* This is not yet a release and some minor changes to the API on-disk format are going to occur before that
Diffstat (limited to 'src/hydrilla/server/serve.py')
-rw-r--r--  src/hydrilla/server/serve.py  604
1 files changed, 604 insertions, 0 deletions
diff --git a/src/hydrilla/server/serve.py b/src/hydrilla/server/serve.py
new file mode 100644
index 0000000..815ac63
--- /dev/null
+++ b/src/hydrilla/server/serve.py
@@ -0,0 +1,604 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+# Main repository logic.
+#
+# This file is part of Hydrilla
+#
+# Copyright (C) 2021, 2022 Wojtek Kosior
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#
+# I, Wojtek Kosior, thereby promise not to sue for violation of this
+# file's license. Although I request that you do not make use of this code
+# in a proprietary program, I am not going to enforce this in court.
+
+import re
+import os
+import pathlib
+import json
+import gettext
+import logging
+
+from pathlib import Path
+from hashlib import sha256
+from abc import ABC, abstractmethod
+from typing import Optional, Union, Iterable
+
+from flask import Flask, Blueprint, current_app, url_for, abort, request, \
+ redirect, send_file
+from jinja2 import Environment, PackageLoader
+from werkzeug import Response
+
+from .. import util
+
+here = pathlib.Path(__file__).resolve().parent
+
+def load_config(config_path: Path) -> dict:
+ config = {}
+ to_load = [config_path]
+ failures_ok = [False]
+
+ while to_load:
+ path = to_load.pop()
+ can_fail = failures_ok.pop()
+
+ try:
+ json_text = util.strip_json_comments(Path(path).read_text())
+ new_config = json.loads(json_text)
+ except Exception as e:
+ if can_fail:
+ continue
+ raise e from None
+
+ config.update(new_config)
+
+ for key, failure_ok in [('try_configs', True), ('use_configs', False)]:
+ paths = new_config.get(key, [])
+ paths.reverse()
+ to_load.extend(paths)
+ failures_ok.extend([failure_ok] * len(paths))
+
+ for key in ('try_configs', 'use_configs'):
+ if key in config:
+ config.pop(key)
+
+ for key in ('malcontent_dir', 'hydrilla_project_url'):
+ if key not in config:
+ raise ValueError(_('config_key_absent_{}').format(key))
+
+ malcontent_path = Path(config['malcontent_dir'])
+ if not malcontent_path.is_absolute():
+ malcontent_path = config_path.parent / malcontent_path
+
+ config['malcontent_dir'] = str(malcontent_path.resolve())
+
+ return config
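A minimal sketch of how load_config() could be exercised, assuming the hydrilla package from this commit is installed; the file layout and values below are illustrative, not taken from the commit:

    # Illustrative only; not part of the commit.
    import json, tempfile
    from pathlib import Path
    from hydrilla.server.serve import load_config

    config_dir = Path(tempfile.mkdtemp())
    (config_dir / 'config.json').write_text(json.dumps({
        'malcontent_dir': './malcontent',   # resolved relative to the config file
        'hydrilla_project_url': 'https://hydrillabugs.koszko.org/',
        'try_configs': ['./local.json']     # optional; a missing file is skipped
    }))

    config = load_config(config_dir / 'config.json')
    print(config['malcontent_dir'])         # absolute path under config_dir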
+
+class ItemInfo(ABC):
+ """Shortened data of a resource/mapping."""
+ def __init__(self, item_obj: dict):
+ """Initialize ItemInfo using item definition read from JSON."""
+ self.version = util.normalize_version(item_obj['version'])
+ self.identifier = item_obj['identifier']
+ self.uuid = item_obj['uuid']
+ self.long_name = item_obj['long_name']
+
+ def path(self) -> str:
+ """
+ Get a relative path to this item's JSON definition with respect to
+ the directory containing items of this type.
+ """
+ return f'{self.identifier}/{util.version_string(self.version)}'
+
+class ResourceInfo(ItemInfo):
+ """Shortened data of a resource."""
+ def __init__(self, resource_obj: dict):
+ """Initialize ResourceInfo using resource definition read from JSON."""
+ super().__init__(resource_obj)
+
+ self.dependencies = resource_obj.get('dependencies', [])
+
+class MappingInfo(ItemInfo):
+ """Shortened data of a mapping."""
+ def __init__(self, mapping_obj: dict):
+ """Initialize MappingInfo using mapping definition read from JSON."""
+ super().__init__(mapping_obj)
+
+ self.payloads = {}
+ for pattern, res_ref in mapping_obj.get('payloads', {}).items():
+ self.payloads[pattern] = res_ref['identifier']
+
+ def as_query_result(self) -> dict:
+ """
+ Produce a json.dump()-able object describing this mapping as one of a
+ collection of query results.
+ """
+ return {
+ 'version': self.version,
+ 'identifier': self.identifier,
+ 'long_name': self.long_name
+ }
+
+class VersionedItemInfo:
+ """Stores data of multiple versions of given resource/mapping."""
+ def __init__(self):
+ self.uuid = None
+ self.identifier = None
+ self.by_version = {}
+ self.known_versions = []
+
+ def register(self, item_info: ItemInfo) -> None:
+ """
+ Make item info queryable by version. Perform sanity checks for uuid.
+ """
+ if self.identifier is None:
+ self.identifier = item_info.identifier
+ self.uuid = item_info.uuid
+ elif self.uuid != item_info.uuid:
+ raise ValueError(_('uuid_mismatch_{identifier}')
+ .format(identifier=self.identifier))
+
+ ver = item_info.version
+ ver_str = util.version_string(ver)
+
+ if ver_str in self.by_version:
+ raise ValueError(_('version_clash_{identifier}_{version}')
+ .format(identifier=self.identifier,
+ version=ver_str))
+
+ self.by_version[ver_str] = item_info
+ self.known_versions.append(ver)
+
+ def get_by_ver(self, ver: Optional[list[int]]=None) -> Optional[ItemInfo]:
+ """
+ Find and return info of the newest version of the item.
+
+ If ver is specified, instead find and return info of that version of the
+ item (or None if absent).
+ """
+ ver = util.version_string(ver or self.known_versions[-1])
+
+ return self.by_version.get(ver)
+
+ def get_all(self) -> list[ItemInfo]:
+ """
+ Return a list of item info for all its versions, from oldest to newest.
+ """
+ return [self.by_version[util.version_string(ver)]
+ for ver in self.known_versions]
+
+class PatternTreeNode:
+ """
+ "Pattern Tree" is how we refer to the data structure used for querying
+ Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
+ is to make it possible, for a given URL, to quickly retrieve all known patterns
+ that match it.
+ """
+ def __init__(self):
+ self.wildcard_matches = [None, None, None]
+ self.literal_match = None
+ self.children = {}
+
+ def search(self, segments):
+ """
+ Yields all matches of this segment sequence against the tree that
+ starts at this node. Results are produced in order from greatest to
+ lowest pattern specificity.
+ """
+ nodes = [self]
+
+ for segment in segments:
+ next_node = nodes[-1].children.get(segment)
+ if next_node is None:
+ break
+
+ nodes.append(next_node)
+
+ nsegments = len(segments)
+ cond_literal = lambda: len(nodes) == nsegments
+ cond_wildcard = [
+ lambda: len(nodes) + 1 == nsegments and segments[-1] != '*',
+ lambda: len(nodes) + 1 < nsegments,
+ lambda: len(nodes) + 1 != nsegments or segments[-1] != '***'
+ ]
+
+ while nodes:
+ node = nodes.pop()
+
+ for item, condition in [(node.literal_match, cond_literal),
+ *zip(node.wildcard_matches, cond_wildcard)]:
+ if item is not None and condition():
+ yield item
+
+ def add(self, segments, item_instantiator):
+ """
+ Make item queryable through (this branch of) the Pattern Tree. If there
+ was not yet any item associated with the tree path designated by
+ segments, create a new one using the item_instantiator() function. Return
+ all items matching this path (both the ones that existed and the ones
+ just created).
+ """
+ node = self
+ segment = None
+
+ for segment in segments:
+ wildcards = node.wildcard_matches
+
+ child = node.children.get(segment) or PatternTreeNode()
+ node.children[segment] = child
+ node = child
+
+ if node.literal_match is None:
+ node.literal_match = item_instantiator()
+
+ if segment not in ('*', '**', '***'):
+ return [node.literal_match]
+
+ if wildcards[len(segment) - 1] is None:
+ wildcards[len(segment) - 1] = item_instantiator()
+
+ return [node.literal_match, wildcards[len(segment) - 1]]
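A minimal sketch of registering and querying one branch of the Pattern Tree, assuming the hydrilla package from this commit is installed; the segments and the registered item are illustrative:

    # Illustrative only; not part of the commit.
    from hydrilla.server.serve import PatternTreeNode

    root = PatternTreeNode()

    # Register an item under the reversed domain labels of '*.example.com'.
    for match_list in root.add(['com', 'example', '*'], list):
        match_list.append('sample-item')

    # A concrete subdomain is matched through the '*' wildcard slot.
    print(list(root.search(['com', 'example', 'www'])))
    # expected: [['sample-item']]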
+
+proto_regex = re.compile(r'^(?P<proto>\w+)://(?P<rest>.*)$')
+user_re = r'[^/?#@]+@' # r'(?P<user>[^/?#@]+)@' # discarded for now
+query_re = r'\??[^#]*' # r'\??(?P<query>[^#]*)' # discarded for now
+domain_re = r'(?P<domain>[^/?#]+)'
+path_re = r'(?P<path>[^?#]*)'
+http_regex = re.compile(f'{domain_re}{path_re}{query_re}.*')
+ftp_regex = re.compile(f'(?:{user_re})?{domain_re}{path_re}.*')
+
+class UrlError(ValueError):
+ """Used to report a URL or URL pattern that is invalid or unsupported."""
+ pass
+
+class DeconstructedUrl:
+ """Represents a deconstructed URL or URL pattern"""
+ def __init__(self, url):
+ self.url = url
+
+ match = proto_regex.match(url)
+ if not match:
+ raise UrlError(_('invalid_URL_{}').format(url))
+
+ self.proto = match.group('proto')
+ if self.proto not in ('http', 'https', 'ftp'):
+ raise UrlError(_('disallowed_protocol_{}').format(self.proto))
+
+ if self.proto == 'ftp':
+ match = ftp_regex.match(match.group('rest'))
+ elif self.proto in ('http', 'https'):
+ match = http_regex.match(match.group('rest'))
+
+ if not match:
+ raise UrlError(_('invalid_URL_{}').format(url))
+
+ self.domain = match.group('domain').split('.')
+ self.domain.reverse()
+ self.path = [*filter(None, match.group('path').split('/'))]
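A short sketch of the decomposition performed here, assuming the hydrilla package from this commit is installed; the URL is illustrative:

    # Illustrative only; not part of the commit.
    from hydrilla.server.serve import DeconstructedUrl

    deco = DeconstructedUrl('https://www.example.com/ab/cd?x=1#frag')
    print(deco.proto)    # 'https'
    print(deco.domain)   # ['com', 'example', 'www'], labels reversed for tree lookup
    print(deco.path)     # ['ab', 'cd'], query and fragment are discarded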
+
+class PatternMapping:
+ """
+ A mapping info, together with one of its patterns, as stored in Pattern
+ Tree.
+ """
+ def __init__(self, pattern: str, mapping_info: MappingInfo):
+ self.pattern = pattern
+ self.mapping_info = mapping_info
+
+ def register(self, pattern_tree: dict):
+ """
+ Make self queryable through the Pattern Tree passed in the argument.
+ """
+ deco = DeconstructedUrl(self.pattern)
+
+ domain_tree = pattern_tree.get(deco.proto) or PatternTreeNode()
+ pattern_tree[deco.proto] = domain_tree
+
+ for path_tree in domain_tree.add(deco.domain, PatternTreeNode):
+ for match_list in path_tree.add(deco.path, list):
+ match_list.append(self)
+
+class Malcontent:
+ """
+ Instance of this class represents a directory with files that can be loaded
+ and served by Hydrilla.
+ """
+ def __init__(self, malcontent_dir_path: Union[Path, str]):
+ """
+ When an instance of Malcontent is constructed, it searches
+ malcontent_dir_path for serveable site-modifying packages and loads
+ them into its data structures.
+ """
+ self.infos = {'resource': {}, 'mapping': {}}
+ self.pattern_tree = {}
+
+ self.malcontent_dir_path = pathlib.Path(malcontent_dir_path).resolve()
+
+ if not self.malcontent_dir_path.is_dir():
+ raise ValueError(_('malcontent_dir_path_not_dir'))
+
+ for item_type in ('mapping', 'resource'):
+ type_path = self.malcontent_dir_path / item_type
+ if not type_path.is_dir():
+ continue
+
+ for subpath in type_path.iterdir():
+ if not subpath.is_dir():
+ continue
+
+ for ver_file in subpath.iterdir():
+ try:
+ self._load_item(item_type, ver_file)
+ except Exception as e:
+ if current_app._hydrilla_werror:
+ raise e from None
+
+ msg = _('couldnt_load_item_from_{}').format(ver_file)
+ logging.error(msg, exc_info=True)
+
+ self._report_missing()
+ self._finalize()
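A sketch of the on-disk layout this constructor walks; the identifier and version below are made up, and such files would normally be produced by the separate Hydrilla builder tool:

    # Illustrative only; not part of the commit.
    from pathlib import Path

    malcontent_dir = Path('/srv/hydrilla/malcontent')    # taken from the config
    item_type = 'mapping'                                # or 'resource'
    # One JSON definition file per item version, validated against
    # api_mapping_description-1.schema.json or api_resource_description-1.schema.json:
    ver_file = malcontent_dir / item_type / 'example-mapping' / '2022.1'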
+
+ def _load_item(self, item_type: str, ver_file: Path) -> None:
+ """
+ Reads, validates and autocompletes a serveable mapping/resource
+ definition, then registers information from it in data structures.
+ """
+ version = util.parse_version(ver_file.name)
+ identifier = ver_file.parent.name
+
+ with open(ver_file, 'rt') as file_handle:
+ item_json = json.load(file_handle)
+
+ util.validator_for(f'api_{item_type}_description-1.schema.json')\
+ .validate(item_json)
+
+ if item_type == 'resource':
+ item_info = ResourceInfo(item_json)
+ else:
+ item_info = MappingInfo(item_json)
+
+ if item_info.identifier != identifier:
+ msg = _('item_{item}_in_file_{file}')\
+ .format(item=item_info.identifier, file=ver_file)
+ raise ValueError(msg)
+
+ if item_info.version != version:
+ ver_str = util.version_string(item_info.version)
+ msg = _('item_version_{ver}_in_file_{file}')\
+ .format(ver=ver_str, file=ver_file)
+ raise ValueError(msg)
+
+ versioned_info = self.infos[item_type].get(identifier)
+ if versioned_info is None:
+ versioned_info = VersionedItemInfo()
+ self.infos[item_type][identifier] = versioned_info
+
+ versioned_info.register(item_info)
+
+ def _all_of_type(self, item_type: str) -> Iterable[ItemInfo]:
+ """Iterator over all registered versions of all mappings/resources."""
+ for versioned_info in self.infos[item_type].values():
+ for item_info in versioned_info.by_version.values():
+ yield item_info
+
+ def _report_missing(self) -> None:
+ """
+ Use logger to print information about items that are referenced but
+ were not loaded.
+ """
+ def report_missing_dependency(info: ResourceInfo, dep: str) -> None:
+ msg = _('no_dep_{resource}_{ver}_{dep}')\
+ .format(dep=dep, resource=info.identifier,
+ ver=util.version_string(info.version))
+ logging.error(msg)
+
+ for resource_info in self._all_of_type('resource'):
+ for dep in resource_info.dependencies:
+ if dep not in self.infos['resource']:
+ report_missing_dependency(resource_info, dep)
+
+ def report_missing_payload(info: MappingInfo, payload: str) -> None:
+ msg = _('no_payload_{mapping}_{ver}_{payload}')\
+ .format(mapping=info.identifier, payload=payload,
+ ver=util.version_string(info.version))
+ logging.error(msg)
+
+ for mapping_info in self._all_of_type('mapping'):
+ for payload in mapping_info.payloads.values():
+ if payload not in self.infos['resource']:
+ report_missing_payload(mapping_info, payload)
+
+ def _finalize(self):
+ """
+ Initialize structures needed to serve queries. Called once after all
+ data gets loaded.
+ """
+ for infos_dict in self.infos.values():
+ for versioned_info in infos_dict.values():
+ versioned_info.known_versions.sort()
+
+ for info in self._all_of_type('mapping'):
+ for pattern in info.payloads:
+ try:
+ PatternMapping(pattern, info).register(self.pattern_tree)
+ except Exception as e:
+ if current_app._hydrilla_werror:
+ raise e from None
+ msg = _('couldnt_register_{mapping}_{ver}_{pattern}')\
+ .format(mapping=info.identifier, pattern=pattern,
+ ver=util.version_string(info.version))
+ logging.error(msg)
+
+ def query(self, url: str) -> list[MappingInfo]:
+ """
+ Return a list of registered mappings that match url.
+
+ If multiple versions of a mapping are applicable, only the most recent
+ is included in the result.
+ """
+ deco = DeconstructedUrl(url)
+
+ collected = {}
+
+ domain_tree = self.pattern_tree.get(deco.proto) or PatternTreeNode()
+
+ def process_mapping(pattern_mapping: PatternMapping) -> None:
+ if url[-1] != '/' and pattern_mapping.pattern[-1] == '/':
+ return
+
+ info = pattern_mapping.mapping_info
+
+ if info.identifier not in collected or \
+ info.version > collected[info.identifier].version:
+ collected[info.identifier] = info
+
+ for path_tree in domain_tree.search(deco.domain):
+ for matches_list in path_tree.search(deco.path):
+ for pattern_mapping in matches_list:
+ process_mapping(pattern_mapping)
+
+ return list(collected.values())
+
+bp = Blueprint('bp', __package__)
+
+def create_app(config_path: Path=(here / 'config.json'), flask_config: dict={}):
+ """Create the Flask instance."""
+ config = load_config(config_path)
+
+ app = Flask(__package__, static_url_path='/',
+ static_folder=config['malcontent_dir'])
+ app.config.update(flask_config)
+
+ language = flask_config.get('lang', 'en')
+ translation = gettext.translation('hydrilla', localedir=(here / 'locales'),
+ languages=[language])
+
+ app._hydrilla_gettext = translation.gettext
+
+ # https://stackoverflow.com/questions/9449101/how-to-stop-flask-from-initialising-twice-in-debug-mode
+ if app.debug and os.environ.get('WERKZEUG_RUN_MAIN') != 'true':
+ return app
+
+ app._hydrilla_project_url = config['hydrilla_project_url']
+ app._hydrilla_werror = config.get('werror', False)
+ if 'hydrilla_parent' in config:
+ raise NotImplementedError(f"'hydrilla_parent' (set in {config_path.name}) is not implemented")
+
+ malcontent_dir = pathlib.Path(config['malcontent_dir'])
+ if not malcontent_dir.is_absolute():
+ malcontent_dir = config_path.parent / malcontent_dir
+ with app.app_context():
+ app._hydrilla_malcontent = Malcontent(malcontent_dir.resolve())
+
+ app.register_blueprint(bp)
+
+ return app
+
+def _(text_key):
+ return current_app._hydrilla_gettext(text_key)
+
+def malcontent():
+ return current_app._hydrilla_malcontent
+
+# TODO: override create_jinja_environment() method of Flask instead of wrapping
+# Jinja environment
+class MyEnvironment(Environment):
+ """
+ A wrapper class around jinja2.Environment that causes the GNU gettext
+ function (as '_' and '__'), the url_for function and the
+ 'hydrilla_project_url' config option to be passed to every call of each
+ template's render() method.
+ """
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def get_template(self, *args, **kwargs):
+ template = super().get_template(*args, **kwargs)
+ old_render = template.render
+
+ def new_render(*args, **kwargs):
+ _ = current_app._hydrilla_gettext
+ project_url = current_app._hydrilla_project_url
+
+ def escaping_gettext(text_key):
+ from markupsafe import escape
+
+ return str(escape(_(text_key)))
+
+ final_kwargs = {
+ '_': escaping_gettext,
+ '__': escaping_gettext,
+ 'url_for': url_for,
+ 'hydrilla_project_url' : project_url
+ }
+ final_kwargs.update(kwargs)
+
+ return old_render(*args, **final_kwargs)
+
+ template.render = new_render
+
+ return template
+
+j2env = MyEnvironment(loader=PackageLoader(__package__), autoescape=False)
+
+indexpage = j2env.get_template('index.html')
+@bp.route('/')
+def index():
+ return indexpage.render()
+
+identifier_json_re = re.compile(r'^([-0-9a-z.]+)\.json$')
+
+def get_resource_or_mapping(item_type: str, identifier: str) -> Response:
+ """
+ Strip '.json' from 'identifier', look the item up and send its JSON
+ description.
+ """
+ match = identifier_json_re.match(identifier)
+ if not match:
+ abort(404)
+
+ identifier = match.group(1)
+
+ versioned_info = malcontent().infos[item_type].get(identifier)
+
+ info = versioned_info and versioned_info.get_by_ver()
+ if info is None:
+ abort(404)
+
+ # no need for send_from_directory(); path is safe, constructed by us
+ return send_file(malcontent().malcontent_dir_path / item_type / info.path())
+
+@bp.route('/mapping/<string:identifier_dot_json>')
+def get_newest_mapping(identifier_dot_json: str) -> Response:
+ return get_resource_or_mapping('mapping', identifier_dot_json)
+
+@bp.route('/resource/<string:identifier_dot_json>')
+def get_newest_resource(identifier_dot_json: str) -> Response:
+ return get_resource_or_mapping('resource', identifier_dot_json)
+
+@bp.route('/query')
+def query():
+ url = request.args['url']
+
+ mapping_refs = [i.as_query_result() for i in malcontent().query(url)]
+ result = {
+ 'api_schema_version': [1],
+ 'generated_by': {
+ 'name': 'hydrilla'
+ },
+ 'mappings': mapping_refs
+ }
+
+ return json.dumps(result)
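A minimal sketch of exercising the resulting API through Flask's test client; the config path and query URL are made up, and the hydrilla package (with its locale data) and a valid configuration are assumed to be in place:

    # Illustrative only; not part of the commit.
    import json
    from pathlib import Path
    from hydrilla.server.serve import create_app

    app = create_app(Path('/etc/hydrilla/config.json'))
    client = app.test_client()

    response = client.get('/query?url=https://example.org/')
    print(json.loads(response.get_data(as_text=True)))
    # e.g. {'api_schema_version': [1], 'generated_by': {'name': 'hydrilla'},
    #       'mappings': [...]}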