From 6676b4ed90e19e2fd6ee5f4242cf85f64db145d8 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Tue, 8 Feb 2022 15:29:49 +0100 Subject: rework Hydrilla to use a separate tool for building its source packages * Hydrilla now depends on "Hydrilla builder" developed at: https://git.koszko.org/hydrilla-builder/ * Hydrilla repository is now REUSE-compliant * The debian packaging is temporarily not tested and likely to be broken * JSON schemas are now in use (through 'jsonschema' Python library) * This is not yet a release and some minor changes to the API on-disk format are going to occur before that --- src/hydrilla/server/serve.py | 604 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100644 src/hydrilla/server/serve.py (limited to 'src/hydrilla/server/serve.py') diff --git a/src/hydrilla/server/serve.py b/src/hydrilla/server/serve.py new file mode 100644 index 0000000..815ac63 --- /dev/null +++ b/src/hydrilla/server/serve.py @@ -0,0 +1,604 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +# Main repository logic. +# +# This file is part of Hydrilla +# +# Copyright (C) 2021, 2022 Wojtek Kosior +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# +# I, Wojtek Kosior, thereby promise not to sue for violation of this +# file's license.
# Although I request that you do not make use this code
# in a proprietary program, I am not going to enforce this in court.

import re
import os
import pathlib
import json
import gettext
import logging

from pathlib import Path
from hashlib import sha256
from abc import ABC, abstractmethod
from typing import Optional, Union, Iterable

from flask import Flask, Blueprint, current_app, url_for, abort, request, \
    redirect, send_file
from jinja2 import Environment, PackageLoader
from werkzeug import Response

from .. import util

here = pathlib.Path(__file__).resolve().parent

def load_config(config_path: Path) -> dict:
    """
    Load the JSON configuration file at config_path together with every file
    it pulls in through its 'try_configs' and 'use_configs' lists.

    Files are processed depth-first; values from files loaded later override
    values from files loaded earlier. A file listed under 'try_configs' that
    cannot be read or parsed is skipped, while a failure of the main file or
    of a 'use_configs' file propagates to the caller.

    Raises ValueError when 'malcontent_dir' or 'hydrilla_project_url' is
    missing from the merged result. Returns the merged dict with
    'malcontent_dir' turned into an absolute, resolved path string (relative
    values are taken relative to the directory of config_path).
    """
    config_path = Path(config_path)

    config = {}
    # Two parallel stacks: paths still to load and, for each of them, whether
    # a failure to load it may be ignored.
    to_load = [config_path]
    failures_ok = [False]

    while to_load:
        path = Path(to_load.pop())
        can_fail = failures_ok.pop()

        try:
            # Read the file that was just popped. Re-reading config_path here
            # would ignore included files and never terminate once the main
            # config lists any include.
            json_text = util.strip_json_comments(path.read_text())
            new_config = json.loads(json_text)
        except Exception:
            if can_fail:
                continue
            raise

        config.update(new_config)

        for key, failure_ok in [('try_configs', True), ('use_configs', False)]:
            paths = new_config.get(key, [])
            # Pushed in reverse so that pop() yields them in listed order.
            to_load.extend(reversed(paths))
            failures_ok.extend([failure_ok] * len(paths))

    for key in ('try_configs', 'use_configs'):
        config.pop(key, None)

    for key in ('malcontent_dir', 'hydrilla_project_url'):
        if key not in config:
            raise ValueError(_('config_key_absent_{}').format(key))

    malcontent_path = Path(config['malcontent_dir'])
    if not malcontent_path.is_absolute():
        malcontent_path = config_path.parent / malcontent_path

    config['malcontent_dir'] = str(malcontent_path.resolve())

    return config

class ItemInfo(ABC):
    """Shortened data of a resource/mapping."""
    def __init__(self, item_obj: dict):
        """Initialize ItemInfo using item definition read from JSON."""
        self.version = util.normalize_version(item_obj['version'])
        self.identifier = item_obj['identifier']
        self.uuid = item_obj['uuid']
        self.long_name = item_obj['long_name']

    def path(self) -> str:
        """
        Get a relative path to this item's JSON definition with respect to
        directory containing items of this type.
        """
        return f'{self.identifier}/{util.version_string(self.version)}'

class ResourceInfo(ItemInfo):
    """Shortened data of a resource."""
    def __init__(self, resource_obj: dict):
        """Initialize ResourceInfo using resource definition read from JSON."""
        super().__init__(resource_obj)

        self.dependencies = resource_obj.get('dependencies', [])

class MappingInfo(ItemInfo):
    """Shortened data of a mapping."""
    def __init__(self, mapping_obj: dict):
        """Initialize MappingInfo using mapping definition read from JSON."""
        super().__init__(mapping_obj)

        # Maps each URL pattern to the identifier of the resource it refers to.
        self.payloads = {}
        for pattern, res_ref in mapping_obj.get('payloads', {}).items():
            self.payloads[pattern] = res_ref['identifier']

    def as_query_result(self) -> dict:
        """
        Produce a json.dump()-able object describing this mapping as one of a
        collection of query results.
        """
        return {
            'version':    self.version,
            'identifier': self.identifier,
            'long_name':  self.long_name
        }

class VersionedItemInfo:
    """Stores data of multiple versions of given resource/mapping."""
    def __init__(self):
        self.uuid = None
        self.identifier = None
        # Item infos keyed by version string.
        self.by_version = {}
        # Versions in registration order until sorted by the owner of this
        # object (Malcontent._finalize() does that).
        self.known_versions = []

    def register(self, item_info: ItemInfo) -> None:
        """
        Make item info queryable by version. Perform sanity checks for uuid.

        Raises ValueError if the uuid differs from that of previously
        registered versions or if this version was already registered.
        """
        if self.identifier is None:
            self.identifier = item_info.identifier
            self.uuid = item_info.uuid
        elif self.uuid != item_info.uuid:
            raise ValueError(_('uuid_mismatch_{identifier}')
                             .format(identifier=self.identifier))

        ver = item_info.version
        ver_str = util.version_string(ver)

        if ver_str in self.by_version:
            raise ValueError(_('version_clash_{identifier}_{version}')
                             .format(identifier=self.identifier,
                                     version=ver_str))

        self.by_version[ver_str] = item_info
        self.known_versions.append(ver)

    def get_by_ver(self, ver: Optional[list[int]]=None) -> Optional[ItemInfo]:
        """
        Find and return info of the newest version of item.

        If ver is specified, instead find and return info of that version of the
        item (or None if absent). None is also returned when no version has
        been registered at all.

        "Newest" means the last entry of known_versions, so the list has to
        be sorted before this is relied upon.
        """
        if not ver:
            if not self.known_versions:
                return None
            ver = self.known_versions[-1]

        return self.by_version.get(util.version_string(ver))

    def get_all(self) -> list[ItemInfo]:
        """
        Return a list of item info for all its versions, from oldest to newest.
        """
        return [self.by_version[util.version_string(ver)]
                for ver in self.known_versions]

class PatternTreeNode:
    """
    "Pattern Tree" is how we refer to the data structure used for querying
    Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
    is to make it possible for given URL to quickly retrieve all known patterns
    that match it.
    """
    def __init__(self):
        # Items of patterns whose leading segments lead to this node and whose
        # final segment is '*', '**' or '***' (index == asterisk count - 1).
        self.wildcard_matches = [None, None, None]
        # Item of the pattern whose segments lead exactly to this node.
        self.literal_match = None
        # Child nodes keyed by segment text.
        self.children = {}

    def search(self, segments):
        """
        Yields all matches of this segments sequence against the tree that
        starts at this node. Results are produced in order from greatest to
        lowest pattern specificity.
        """
        # Walk down as far as segments can be followed literally, remembering
        # every node passed on the way.
        nodes = [self]

        for segment in segments:
            next_node = nodes[-1].children.get(segment)
            if next_node is None:
                break

            nodes.append(next_node)

        nsegments = len(segments)
        # The conditions below are evaluated only after a node was popped, so
        # len(nodes) equals the number of leading segments consumed to reach
        # the node being examined.
        cond_literal = lambda: len(nodes) == nsegments
        cond_wildcard = [
            # '*': exactly one segment left and it is not a literal '*'.
            lambda: len(nodes) + 1 == nsegments and segments[-1] != '*',
            # '**': at least two segments left.
            lambda: len(nodes) + 1 <  nsegments,
            # '***': any number left, unless the only one is a literal '***'.
            lambda: len(nodes) + 1 != nsegments or  segments[-1] != '***'
        ]

        # Deepest nodes first.
        while nodes:
            node = nodes.pop()

            for item, condition in [(node.literal_match, cond_literal),
                                    *zip(node.wildcard_matches, cond_wildcard)]:
                if item is not None and condition():
                    yield item

    def add(self, segments, item_instantiator):
        """
        Make item queryable through (this branch of) the Pattern Tree. If there
        was not yet any item associated with the tree path designated by
        segments, create a new one using item_instantiator() function. Return
        all items matching this path (both the ones that existed and the ones
        just created).
        """
        node = self
        segment = None

        for segment in segments:
            # Wildcard slots live on the parent of the node a segment leads to.
            wildcards = node.wildcard_matches

            child = node.children.get(segment) or PatternTreeNode()
            node.children[segment] = child
            node = child

        if node.literal_match is None:
            node.literal_match = item_instantiator()

        # Also true for empty segments, in which case segment is still None.
        if segment not in ('*', '**', '***'):
            return [node.literal_match]

        if wildcards[len(segment) - 1] is None:
            wildcards[len(segment) - 1] = item_instantiator()

        return [node.literal_match, wildcards[len(segment) - 1]]

# The group names 'proto', 'rest', 'domain' and 'path' are the ones read back
# through match.group() in DeconstructedUrl.
proto_regex = re.compile(r'^(?P<proto>\w+)://(?P<rest>.*)$')
user_re = r'[^/?#@]+@' # r'(?P<user>[^/?#@]+)@' # discarded for now
query_re = r'\??[^#]*' # r'\??(?P<query>[^#]*)'  # discarded for now
domain_re = r'(?P<domain>[^/?#]+)'
path_re = r'(?P<path>[^?#]*)'
http_regex = re.compile(f'{domain_re}{path_re}{query_re}.*')
ftp_regex = re.compile(f'(?:{user_re})?{domain_re}{path_re}.*')

class UrlError(ValueError):
    """Used to report a URL or URL pattern that is invalid or unsupported."""
    pass

class DeconstructedUrl:
    """Represents a deconstructed URL or URL pattern"""
    def __init__(self, url):
        """
        Split url into protocol, domain labels and path segments.

        Sets self.url (the input), self.proto, self.domain (labels in
        reversed order, i.e. top-level one first) and self.path (non-empty
        path segments). Raises UrlError if url cannot be parsed or its
        protocol is not one of 'http', 'https' and 'ftp'.
        """
        self.url = url

        match = proto_regex.match(url)
        if not match:
            raise UrlError(_('invalid_URL_{}').format(url))

        self.proto = match.group('proto')
        if self.proto not in ('http', 'https', 'ftp'):
            raise UrlError(_('disallowed_protocol_{}').format(self.proto))

        if self.proto == 'ftp':
            match = ftp_regex.match(match.group('rest'))
        elif self.proto in ('http', 'https'):
            match = http_regex.match(match.group('rest'))

        if not match:
            raise UrlError(_('invalid_URL_{}').format(url))

        self.domain = match.group('domain').split('.')
        self.domain.reverse()
        self.path = [*filter(None, match.group('path').split('/'))]

class PatternMapping:
    """
    A mapping info, together with one of its patterns, as stored in Pattern
    Tree.
    """
    def __init__(self, pattern: str, mapping_info: MappingInfo):
        self.pattern = pattern
        self.mapping_info = mapping_info

    def register(self, pattern_tree: dict):
        """
        Make self queryable through the Pattern Tree passed in the argument.

        pattern_tree maps protocol names to domain trees. Items of a domain
        tree are path trees and items of a path tree are lists of
        PatternMapping objects. Raises UrlError for an unparseable pattern.
        """
        deco = DeconstructedUrl(self.pattern)

        domain_tree = pattern_tree.get(deco.proto) or PatternTreeNode()
        pattern_tree[deco.proto] = domain_tree

        for path_tree in domain_tree.add(deco.domain, PatternTreeNode):
            for match_list in path_tree.add(deco.path, list):
                match_list.append(self)

class Malcontent:
    """
    Instance of this class represents a directory with files that can be loaded
    and served by Hydrilla.
    """
    def __init__(self, malcontent_dir_path: Union[Path, str]):
        """
        When an instance of Malcontent is constructed, it searches
        malcontent_dir_path for serveable site-modifying packages and loads
        them into its data structures.

        Expected layout is '<dir>/<mapping|resource>/<identifier>/<version>'.
        Items that fail to load are logged and skipped unless the current
        application's '_hydrilla_werror' flag is set, in which case the error
        propagates. Reading that flag requires an active Flask application
        context.
        """
        self.infos = {'resource': {}, 'mapping': {}}
        self.pattern_tree = {}

        self.malcontent_dir_path = pathlib.Path(malcontent_dir_path).resolve()

        if not self.malcontent_dir_path.is_dir():
            raise ValueError(_('malcontent_dir_path_not_dir'))

        for item_type in ('mapping', 'resource'):
            type_path = self.malcontent_dir_path / item_type
            if not type_path.is_dir():
                continue

            for subpath in type_path.iterdir():
                if not subpath.is_dir():
                    continue

                for ver_file in subpath.iterdir():
                    try:
                        self._load_item(item_type, ver_file)
                    except Exception:
                        if current_app._hydrilla_werror:
                            raise

                        msg = _('couldnt_load_item_from_{}').format(ver_file)
                        logging.error(msg, exc_info=True)

        self._report_missing()
        self._finalize()

    def _load_item(self, item_type: str, ver_file: Path) -> None:
        """
        Read and validate a serveable mapping/resource definition, then
        register information from it in data structures.

        Raises ValueError when the identifier or version stated inside the
        file disagrees with the file's location. Errors from parsing, schema
        validation and registration propagate as well.
        """
        version = util.parse_version(ver_file.name)
        identifier = ver_file.parent.name

        with open(ver_file, 'rt') as file_handle:
            item_json = json.load(file_handle)

        util.validator_for(f'api_{item_type}_description-1.schema.json')\
            .validate(item_json)

        if item_type == 'resource':
            item_info = ResourceInfo(item_json)
        else:
            item_info = MappingInfo(item_json)

        if item_info.identifier != identifier:
            # Named placeholders need keyword arguments; passing a dict
            # positionally would raise KeyError instead of reporting this.
            msg = _('item_{item}_in_file_{file}')\
                .format(item=item_info.identifier, file=ver_file)
            raise ValueError(msg)

        if item_info.version != version:
            ver_str = util.version_string(item_info.version)
            msg = _('item_version_{ver}_in_file_{file}')\
                .format(ver=ver_str, file=ver_file)
            raise ValueError(msg)

        versioned_info = self.infos[item_type].get(identifier)
        if versioned_info is None:
            versioned_info = VersionedItemInfo()
            self.infos[item_type][identifier] = versioned_info

        versioned_info.register(item_info)

    def _all_of_type(self, item_type: str) -> Iterable[ItemInfo]:
        """Iterator over all registered versions of all mappings/resources."""
        for versioned_info in self.infos[item_type].values():
            for item_info in versioned_info.by_version.values():
                yield item_info

    def _report_missing(self) -> None:
        """
        Use logger to print information about items that are referenced but
        were not loaded.
        """
        def report_missing_dependency(info: ResourceInfo, dep: str) -> None:
            # NOTE(review): this message key uses %-style placeholders while
            # being filled in with str.format(). The key is left untouched
            # because the translation catalogue is not visible from here;
            # confirm that the translated text uses {resource}/{ver}/{dep}.
            msg = _('no_dep_%(resource)s_%(ver)s_%(dep)s')\
                .format(dep=dep, resource=info.identifier,
                        ver=util.version_string(info.version))
            logging.error(msg)

        for resource_info in self._all_of_type('resource'):
            for dep in resource_info.dependencies:
                if dep not in self.infos['resource']:
                    report_missing_dependency(resource_info, dep)

        def report_missing_payload(info: MappingInfo, payload: str) -> None:
            msg = _('no_payload_{mapping}_{ver}_{payload}')\
                .format(mapping=info.identifier, payload=payload,
                        ver=util.version_string(info.version))
            logging.error(msg)

        for mapping_info in self._all_of_type('mapping'):
            for payload in mapping_info.payloads.values():
                if payload not in self.infos['resource']:
                    report_missing_payload(mapping_info, payload)

    def _finalize(self):
        """
        Initialize structures needed to serve queries. Called once after all
        data gets loaded.

        Sorts the known versions of every item and registers every pattern of
        every mapping version in the Pattern Tree. A pattern that cannot be
        registered is logged and skipped unless '_hydrilla_werror' is set.
        """
        for infos_dict in self.infos.values():
            for versioned_info in infos_dict.values():
                versioned_info.known_versions.sort()

        for info in self._all_of_type('mapping'):
            for pattern in info.payloads:
                try:
                    PatternMapping(pattern, info).register(self.pattern_tree)
                except Exception:
                    if current_app._hydrilla_werror:
                        raise
                    msg = _('couldnt_register_{mapping}_{ver}_{pattern}')\
                        .format(mapping=info.identifier, pattern=pattern,
                                ver=util.version_string(info.version))
                    # Keep the traceback, as done for item loading failures.
                    logging.error(msg, exc_info=True)

    def query(self, url: str) -> list[MappingInfo]:
        """
        Return a list of registered mappings that match url.

        If multiple versions of a mapping are applicable, only the most recent
        is included in the result.

        Raises UrlError if url cannot be deconstructed.
        """
        deco = DeconstructedUrl(url)

        # Best (newest) matching info found so far, keyed by identifier.
        collected = {}

        domain_tree = self.pattern_tree.get(deco.proto) or PatternTreeNode()

        def process_mapping(pattern_mapping: PatternMapping) -> None:
            # A pattern with a trailing slash only matches URLs that have a
            # trailing slash too.
            if url[-1] != '/' and pattern_mapping.pattern[-1] == '/':
                return

            info = pattern_mapping.mapping_info

            if info.identifier not in collected or \
               info.version > collected[info.identifier].version:
                collected[info.identifier] = info

        for path_tree in domain_tree.search(deco.domain):
            for matches_list in path_tree.search(deco.path):
                for pattern_mapping in matches_list:
                    process_mapping(pattern_mapping)

        return list(collected.values())

bp = Blueprint('bp', __package__)

def create_app(config_path: Path=(here / 'config.json'),
               flask_config: Optional[dict]=None):
    """
    Create the Flask instance.

    config_path names the Hydrilla JSON configuration file. flask_config, if
    given, is merged into the Flask configuration; its 'lang' entry (default
    'en') selects the gettext translation.

    In the parent process of Werkzeug's debug reloader the application is
    returned early, without malcontent loaded and without the blueprint
    registered.

    Raises NotImplementedError if the configuration contains the
    'hydrilla_parent' option.
    """
    if flask_config is None:
        flask_config = {}

    config_path = Path(config_path)
    config = load_config(config_path)

    app = Flask(__package__, static_url_path='/',
                static_folder=config['malcontent_dir'])
    app.config.update(flask_config)

    language = flask_config.get('lang', 'en')
    translation = gettext.translation('hydrilla', localedir=(here / 'locales'),
                                      languages=[language])

    app._hydrilla_gettext = translation.gettext

    # https://stackoverflow.com/questions/9449101/how-to-stop-flask-from-initialising-twice-in-debug-mode
    if app.debug and os.environ.get('WERKZEUG_RUN_MAIN') != 'true':
        return app

    app._hydrilla_project_url = config['hydrilla_project_url']
    app._hydrilla_werror = config.get('werror', False)
    if 'hydrilla_parent' in config:
        # A builtin exception is raised here because the name used before
        # was not defined anywhere in this module.
        raise NotImplementedError(
            f"'hydrilla_parent' option ({config_path.name}) is not supported"
        )

    malcontent_dir = pathlib.Path(config['malcontent_dir'])
    if not malcontent_dir.is_absolute():
        malcontent_dir = config_path.parent / malcontent_dir
    with app.app_context():
        app._hydrilla_malcontent = Malcontent(malcontent_dir.resolve())

    app.register_blueprint(bp)

    return app

def _(text_key):
    """
    Translate text_key using gettext function of the current application.

    When there is no application context (e.g. configuration is still being
    loaded) or the application has no translation set up, text_key is returned
    unchanged so that error reporting does not itself fail.
    """
    try:
        translate = current_app._hydrilla_gettext
    except (RuntimeError, AttributeError):
        return text_key

    return translate(text_key)

def malcontent():
    """Return the Malcontent instance of the current application."""
    return current_app._hydrilla_malcontent

# TODO: override create_jinja_environment() method of Flask instead of wrapping
#       Jinja environment
class MyEnvironment(Environment):
    """
    A wrapper class around jinja2.Environment that causes GNU gettext function
    (as '_' and '__'), url_for function and 'hydrilla_project_url' config option
    to be passed to every call of each template's render() method.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_template(self, *args, **kwargs):
        """
        Return the requested template with its render() replaced by a wrapper
        that supplies the extra variables. Variables passed explicitly to
        render() take precedence over the supplied ones.
        """
        template = super().get_template(*args, **kwargs)
        old_render = template.render

        def new_render(*args, **kwargs):
            _ = current_app._hydrilla_gettext
            project_url = current_app._hydrilla_project_url

            def escaping_gettext(text_key):
                from markupsafe import escape

                return str(escape(_(text_key)))

            final_kwargs = {
                '_': escaping_gettext,
                '__': escaping_gettext,
                'url_for': url_for,
                'hydrilla_project_url' : project_url
            }
            final_kwargs.update(kwargs)

            return old_render(*args, **final_kwargs)

        template.render = new_render

        return template

j2env = MyEnvironment(loader=PackageLoader(__package__), autoescape=False)

indexpage = j2env.get_template('index.html')
@bp.route('/')
def index():
    """Serve the index page."""
    return indexpage.render()

identifier_json_re = re.compile(r'^([-0-9a-z.]+)\.json$')

def get_resource_or_mapping(item_type: str, identifier: str) -> Response:
    """
    Strip '.json' from 'identifier', look the item up and send its JSON
    description.

    The newest known version is sent. Responds with 404 when the name is
    malformed or no such item is known.
    """
    match = identifier_json_re.match(identifier)
    if not match:
        abort(404)

    identifier = match.group(1)

    versioned_info = malcontent().infos[item_type].get(identifier)

    info = versioned_info and versioned_info.get_by_ver()
    if info is None:
        abort(404)

    # no need for send_from_directory(); path is safe, constructed by us
    return send_file(malcontent().malcontent_dir_path / item_type / info.path())

# NOTE(review): the URL variable parts of the two rules below were lost from
# the text this file was recovered from. The variable name is dictated by the
# view signatures; the 'string' converter is an assumption to be confirmed.
@bp.route('/mapping/<string:identifier_dot_json>')
def get_newest_mapping(identifier_dot_json: str) -> Response:
    """Send JSON description of the newest version of a mapping."""
    return get_resource_or_mapping('mapping', identifier_dot_json)

@bp.route('/resource/<string:identifier_dot_json>')
def get_newest_resource(identifier_dot_json: str) -> Response:
    """Send JSON description of the newest version of a resource."""
    return get_resource_or_mapping('resource', identifier_dot_json)

@bp.route('/query')
def query():
    """
    Send a JSON list of mappings applicable to the URL given in the 'url'
    query parameter. Responds with 400 when the parameter is missing or
    cannot be parsed as a supported URL.
    """
    url = request.args['url']

    try:
        mappings = malcontent().query(url)
    except UrlError:
        # Client-supplied garbage is a client error, not a server failure.
        abort(400)

    mapping_refs = [i.as_query_result() for i in mappings]
    result = {
        'api_schema_version': [1],
        'generated_by': {
            'name': 'hydrilla'
        },
        'mappings': mapping_refs
    }

    return Response(json.dumps(result), mimetype='application/json')

# --
# cgit v1.2.3