From 6676b4ed90e19e2fd6ee5f4242cf85f64db145d8 Mon Sep 17 00:00:00 2001
From: Wojtek Kosior <koszko@koszko.org>
Date: Tue, 8 Feb 2022 15:29:49 +0100
Subject: rework Hydrilla to use a separate tool for building its source
 packages

* Hydrilla now depends on "Hydrilla builder" developed at: https://git.koszko.org/hydrilla-builder/
* Hydrilla repository is now REUSE-compliant
* The debian packaging is temporarily not tested and likely to be broken
* JSON schemas are now in use (through 'jsonschema' Python library)
* This is not yet a release and some minor changes to the API on-disk format are going to occur before that
---
 src/hydrilla/server/serve.py | 604 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 604 insertions(+)
 create mode 100644 src/hydrilla/server/serve.py

(limited to 'src/hydrilla/server/serve.py')

diff --git a/src/hydrilla/server/serve.py b/src/hydrilla/server/serve.py
new file mode 100644
index 0000000..815ac63
--- /dev/null
+++ b/src/hydrilla/server/serve.py
@@ -0,0 +1,604 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+# Main repository logic.
+#
+# This file is part of Hydrilla
+#
+# Copyright (C) 2021, 2022 Wojtek Kosior
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+#
+# I, Wojtek Kosior, thereby promise not to sue for violation of this
+# file's license. Although I request that you do not make use this code
+# in a proprietary program, I am not going to enforce this in court.
+
+import re
+import os
+import pathlib
+import json
+import gettext
+import logging
+
+from pathlib import Path
+from hashlib import sha256
+from abc import ABC, abstractmethod
+from typing import Optional, Union, Iterable
+
+from flask import Flask, Blueprint, current_app, url_for, abort, request, \
+    redirect, send_file
+from jinja2 import Environment, PackageLoader
+from werkzeug import Response
+
+from .. import util
+
+here = pathlib.Path(__file__).resolve().parent
+
+def load_config(config_path: Path) -> dict:
+    config = {}
+    to_load = [config_path]
+    failures_ok = [False]
+
+    while to_load:
+        path = to_load.pop()
+        can_fail = failures_ok.pop()
+
+        try:
+            json_text = util.strip_json_comments(config_path.read_text())
+            new_config = json.loads(json_text)
+        except Exception as e:
+            if can_fail:
+                continue
+            raise e from None
+
+        config.update(new_config)
+
+        for key, failure_ok in [('try_configs', True), ('use_configs', False)]:
+            paths = new_config.get(key, [])
+            paths.reverse()
+            to_load.extend(paths)
+            failures_ok.extend([failure_ok] * len(paths))
+
+    for key in ('try_configs', 'use_configs'):
+        if key in config:
+            config.pop(key)
+
+    for key in ('malcontent_dir', 'hydrilla_project_url'):
+        if key not in config:
+            raise ValueError(_('config_key_absent_{}').format(key))
+
+    malcontent_path = Path(config['malcontent_dir'])
+    if not malcontent_path.is_absolute():
+        malcontent_path = config_path.parent / malcontent_path
+
+    config['malcontent_dir'] = str(malcontent_path.resolve())
+
+    return config
+
+class ItemInfo(ABC):
+    """Data common to resources and mappings, in shortened form."""
+    def __init__(self, item_obj: dict) -> None:
+        """Initialize ItemInfo using item definition read from JSON."""
+        self.version    = util.normalize_version(item_obj['version'])
+        self.identifier = item_obj['identifier']
+        self.uuid       = item_obj['uuid']
+        self.long_name  = item_obj['long_name']
+
+    def path(self) -> str:
+        """
+        Return '<identifier>/<version string>': the path of this item's JSON
+        definition relative to the directory containing items of this type.
+        """
+        return f'{self.identifier}/{util.version_string(self.version)}'
+
+class ResourceInfo(ItemInfo):
+    """Shortened data of a resource, including its list of dependencies."""
+    def __init__(self, resource_obj: dict) -> None:
+        """Initialize ResourceInfo using resource definition read from JSON."""
+        super().__init__(resource_obj)
+
+        self.dependencies = resource_obj.get('dependencies', [])
+
+class MappingInfo(ItemInfo):
+    """Shortened data of a mapping."""
+    def __init__(self, mapping_obj: dict) -> None:
+        """Initialize MappingInfo using mapping definition read from JSON."""
+        super().__init__(mapping_obj)
+
+        self.payloads = {} # URL pattern -> identifier of referenced resource
+        for pattern, res_ref in mapping_obj.get('payloads', {}).items():
+            self.payloads[pattern] = res_ref['identifier']
+
+    def as_query_result(self) -> dict:
+        """
+        Produce a json.dump()-able dict (version, identifier, long_name)
+        describing this mapping as one of a collection of query results.
+        """
+        return {
+            'version':    self.version,
+            'identifier': self.identifier,
+            'long_name':  self.long_name
+        }
+
+class VersionedItemInfo:
+    """Stores data of multiple versions of given resource/mapping."""
+    def __init__(self) -> None:
+        self.uuid = None
+        self.identifier = None
+        self.by_version = {} # version string -> ItemInfo
+        self.known_versions = [] # in registration order until it gets sorted
+
+    def register(self, item_info: ItemInfo) -> None:
+        """
+        Register item_info; ValueError on uuid mismatch or duplicate version.
+        """
+        if self.identifier is None:
+            self.identifier = item_info.identifier
+            self.uuid = item_info.uuid
+        elif self.uuid != item_info.uuid:
+            raise ValueError(_('uuid_mismatch_{identifier}')
+                             .format(identifier=self.identifier))
+
+        ver = item_info.version
+        ver_str = util.version_string(ver)
+
+        if ver_str in self.by_version:
+            raise ValueError(_('version_clash_{identifier}_{version}')
+                             .format(identifier=self.identifier,
+                                     version=ver_str))
+
+        self.by_version[ver_str] = item_info
+        self.known_versions.append(ver)
+
+    def get_by_ver(self, ver: Optional[list[int]]=None) -> Optional[ItemInfo]:
+        """
+        Return info of version ver of the item, or None if it is not registered.
+
+        Without ver, return info of the last entry of known_versions; that is
+        the newest version only once the list has been sorted.
+        """
+        ver = util.version_string(ver or self.known_versions[-1])
+
+        return self.by_version.get(ver)
+
+    def get_all(self) -> list[ItemInfo]:
+        """
+        All versions' infos in known_versions order (oldest first once sorted).
+        """
+        return [self.by_version[util.version_string(ver)]
+                for ver in self.known_versions]
+
+class PatternTreeNode:
+    """
+    "Pattern Tree" is how we refer to the data structure used for querying
+    Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
+    is to make it possible for given URL to quickly retrieve all known patterns
+    that match it.
+    """
+    def __init__(self) -> None:
+        self.wildcard_matches = [None, None, None] # for '*', '**', '***'
+        self.literal_match    = None
+        self.children         = {} # segment -> PatternTreeNode
+
+    def search(self, segments):
+        """
+        Yields all matches of this segments sequence against the tree that
+        starts at this node. Results are produced in order from greatest to
+        lowest pattern specificity.
+        """
+        nodes = [self]
+
+        for segment in segments: # descend while segments match literally
+            next_node = nodes[-1].children.get(segment)
+            if next_node is None:
+                break
+
+            nodes.append(next_node)
+
+        nsegments = len(segments)
+        cond_literal = lambda: len(nodes)     == nsegments
+        cond_wildcard = [
+            lambda: len(nodes) + 1 == nsegments and segments[-1] != '*', # '*'
+            lambda: len(nodes) + 1 <  nsegments, # '**'
+            lambda: len(nodes) + 1 != nsegments or  segments[-1] != '***' # '***'
+        ]
+
+        while nodes:
+            node = nodes.pop() # len(nodes) now equals the depth of node
+
+            for item, condition in [(node.literal_match, cond_literal),
+                                    *zip(node.wildcard_matches, cond_wildcard)]:
+                if item is not None and condition():
+                    yield item
+
+    def add(self, segments, item_instantiator):
+        """
+        Make item queryable through (this branch of) the Pattern Tree. If there
+        was not yet any item associated with the tree path designated by
+        segments, create a new one using item_instantiator() function. Return
+        all items matching this path (both the ones that existed and the ones
+        just created).
+        """
+        node = self
+        segment = None
+
+        for segment in segments:
+            wildcards = node.wildcard_matches # list of the parent of `child`
+
+            child = node.children.get(segment) or PatternTreeNode()
+            node.children[segment] = child
+            node = child
+
+        if node.literal_match is None:
+            node.literal_match = item_instantiator()
+
+        if segment not in ('*', '**', '***'): # also true for empty segments
+            return [node.literal_match]
+
+        if wildcards[len(segment) - 1] is None: # slot index = number of stars
+            wildcards[len(segment) - 1] = item_instantiator()
+
+        return [node.literal_match, wildcards[len(segment) - 1]]
+
+proto_regex  = re.compile(r'^(?P<proto>\w+)://(?P<rest>.*)$')
+user_re      = r'[^/?#@]+@' # r'(?P<user>[^/?#@]+)@' # discarded for now
+query_re     = r'\??[^#]*'  # r'\??(?P<query>[^#]*)' # discarded for now
+domain_re    = r'(?P<domain>[^/?#]+)'
+path_re      = r'(?P<path>[^?#]*)'
+http_regex   = re.compile(f'{domain_re}{path_re}{query_re}.*') # for 'rest'
+ftp_regex    = re.compile(f'(?:{user_re})?{domain_re}{path_re}.*') # for 'rest'
+
+class UrlError(ValueError):
+    """Raised for a URL or URL pattern that is invalid or unsupported."""
+    pass
+
+class DeconstructedUrl:
+    """Represents a deconstructed URL or URL pattern"""
+    def __init__(self, url):
+        self.url = url
+
+        match = proto_regex.match(url)
+        if not match:
+            raise UrlError(_('invalid_URL_{}').format(url))
+
+        self.proto = match.group('proto')
+        if self.proto not in ('http', 'https', 'ftp'):
+            raise UrlError(_('disallowed_protocol_{}').format(proto))
+
+        if self.proto == 'ftp':
+            match = ftp_regex.match(match.group('rest'))
+        elif self.proto in ('http', 'https'):
+            match = http_regex.match(match.group('rest'))
+
+        if not match:
+            raise UrlError(_('invalid_URL_{}').format(url))
+
+        self.domain = match.group('domain').split('.')
+        self.domain.reverse()
+        self.path = [*filter(None, match.group('path').split('/'))]
+
+class PatternMapping:
+    """
+    A mapping info, together with one of its patterns, as stored in Pattern
+    Tree.
+    """
+    def __init__(self, pattern: str, mapping_info: MappingInfo):
+        self.pattern = pattern
+        self.mapping_info = mapping_info
+
+    def register(self, pattern_tree: dict):
+        """
+        Make self queryable through the Pattern Tree passed in the argument.
+        """
+        deco = DeconstructedUrl(self.pattern)
+
+        domain_tree = pattern_tree.get(deco.proto) or PatternTreeNode()
+        pattern_tree[deco.proto] = domain_tree
+
+        for path_tree in domain_tree.add(deco.domain, PatternTreeNode):
+            for match_list in path_tree.add(deco.path, list):
+                match_list.append(self)
+
+class Malcontent:
+    """
+    Instance of this class represents a directory with files that can be loaded
+    and served by Hydrilla.
+    """
+    def __init__(self, malcontent_dir_path: Union[Path, str]) -> None:
+        """
+        Load every item found under malcontent_dir_path, which is scanned as
+        <item_type>/<identifier>/<version file> for 'mapping' and 'resource'.
+        Load errors are logged, or re-raised if the app's werror flag is set.
+        """
+        self.infos = {'resource': {}, 'mapping': {}}
+        self.pattern_tree = {}
+
+        self.malcontent_dir_path = pathlib.Path(malcontent_dir_path).resolve()
+
+        if not self.malcontent_dir_path.is_dir():
+            raise ValueError(_('malcontent_dir_path_not_dir'))
+
+        for item_type in ('mapping', 'resource'):
+            type_path = self.malcontent_dir_path / item_type
+            if not type_path.is_dir():
+                continue
+
+            for subpath in type_path.iterdir(): # one directory per identifier
+                if not subpath.is_dir():
+                    continue
+
+                for ver_file in subpath.iterdir(): # one file per version
+                    try:
+                        self._load_item(item_type, ver_file)
+                    except Exception as e:
+                        if current_app._hydrilla_werror:
+                            raise e from None
+
+                        msg = _('couldnt_load_item_from_{}').format(ver_file)
+                        logging.error(msg, exc_info=True)
+
+        self._report_missing()
+        self._finalize()
+
+    def _load_item(self, item_type: str, ver_file: Path) -> None:
+        """
+        Reads, validates and autocompletes serveable mapping/resource
+        definition, then registers information from it in data structures.
+        """
+        version    = util.parse_version(ver_file.name)
+        identifier = ver_file.parent.name
+
+        with open(ver_file, 'rt') as file_handle:
+            item_json = json.load(file_handle)
+
+        util.validator_for(f'api_{item_type}_description-1.schema.json')\
+            .validate(item_json)
+
+        if item_type == 'resource':
+            item_info = ResourceInfo(item_json)
+        else:
+            item_info = MappingInfo(item_json)
+
+        if item_info.identifier != identifier:
+            msg = _('item_{item}_in_file_{file}')\
+                .format({'item': item_info.identifier, 'file': ver_file})
+            raise ValueError(msg)
+
+        if item_info.version != version:
+            ver_str = util.version_string(item_info.version)
+            msg = _('item_version_{ver}_in_file_{file}')\
+                .format({'ver': ver_str, 'file': ver_file})
+            raise ValueError(msg)
+
+        versioned_info = self.infos[item_type].get(identifier)
+        if versioned_info is None:
+            versioned_info = VersionedItemInfo()
+            self.infos[item_type][identifier] = versioned_info
+
+        versioned_info.register(item_info)
+
+    def _all_of_type(self, item_type: str) -> Iterable[ItemInfo]:
+        """Iterator over all registered versions of all mappings/resources."""
+        for versioned_info in self.infos[item_type].values():
+            for item_info in versioned_info.by_version.values():
+                yield item_info
+
+    def _report_missing(self) -> None:
+        """
+        Log each dependency/payload naming a resource that was not loaded.
+        NOTE(review): 'no_dep_...' has %-fields yet is .format()-ed - verify.
+        """
+        def report_missing_dependency(info: ResourceInfo, dep: str) -> None:
+            msg = _('no_dep_%(resource)s_%(ver)s_%(dep)s')\
+                .format(dep=dep, resource=info.identifier,
+                        ver=util.version_string(info.version))
+            logging.error(msg)
+
+        for resource_info in self._all_of_type('resource'):
+            for dep in resource_info.dependencies:
+                if dep not in self.infos['resource']: # keyed by identifier
+                    report_missing_dependency(resource_info, dep)
+
+        def report_missing_payload(info: MappingInfo, payload: str) -> None:
+            msg = _('no_payload_{mapping}_{ver}_{payload}')\
+                .format(mapping=info.identifier, payload=payload,
+                        ver=util.version_string(info.version))
+            logging.error(msg)
+
+        for mapping_info in self._all_of_type('mapping'):
+            for payload in mapping_info.payloads.values():
+                if payload not in self.infos['resource']:
+                    report_missing_payload(mapping_info, payload)
+
+    def _finalize(self):
+        """
+        Initialize structures needed to serve queries. Called once after all
+        data gets loaded.
+        """
+        for infos_dict in self.infos.values():
+            for versioned_info in infos_dict.values():
+                versioned_info.known_versions.sort()
+
+        for info in self._all_of_type('mapping'):
+            for pattern in info.payloads:
+                try:
+                    PatternMapping(pattern, info).register(self.pattern_tree)
+                except Exception as e:
+                    if current_app._hydrilla_werror:
+                        raise e from None
+                    msg = _('couldnt_register_{mapping}_{ver}_{pattern}')\
+                        .format(mapping=info.identifier, pattern=pattern,
+                                ver=util.version_string(info.version))
+                    logging.error(msg)
+
+    def query(self, url: str) -> list[MappingInfo]:
+        """
+        Return a list of registered mappings that match url.
+
+        If multiple versions of a mapping are applicable, only the most recent
+        is included in the result.
+        """
+        deco = DeconstructedUrl(url)
+
+        collected = {}
+
+        domain_tree = self.pattern_tree.get(deco.proto) or PatternTreeNode()
+
+        def process_mapping(pattern_mapping: PatternMapping) -> None:
+            if url[-1] != '/' and pattern_mapping.pattern[-1] == '/':
+                return
+
+            info = pattern_mapping.mapping_info
+
+            if info.identifier not in collected or \
+               info.version > collected[info.identifier].version:
+                collected[info.identifier] = info
+
+        for path_tree in domain_tree.search(deco.domain):
+            for matches_list in path_tree.search(deco.path):
+                for pattern_mapping in matches_list:
+                    process_mapping(pattern_mapping)
+
+        return list(collected.values())
+
+bp = Blueprint('bp', __package__) # registered on the app in create_app()
+
+def create_app(config_path: Path=(here / 'config.json'), flask_config: dict={}):
+    """Create the Flask instance."""
+    config = load_config(config_path)
+
+    app = Flask(__package__, static_url_path='/',
+                static_folder=config['malcontent_dir'])
+    app.config.update(flask_config)
+
+    language = flask_config.get('lang', 'en')
+    translation = gettext.translation('hydrilla', localedir=(here / 'locales'),
+                                      languages=[language])
+
+    app._hydrilla_gettext = translation.gettext
+
+    # https://stackoverflow.com/questions/9449101/how-to-stop-flask-from-initialising-twice-in-debug-mode
+    if app.debug and os.environ.get('WERKZEUG_RUN_MAIN') != 'true':
+        return app
+
+    app._hydrilla_project_url = config['hydrilla_project_url']
+    app._hydrilla_werror = config.get('werror', False)
+    if 'hydrilla_parent' in config:
+        raise MyNotImplError('hydrilla_parent', config_path.name)
+
+    malcontent_dir = pathlib.Path(config['malcontent_dir'])
+    if not malcontent_dir.is_absolute():
+        malcontent_dir = config_path.parent / malcontent_dir
+    with app.app_context():
+        app._hydrilla_malcontent = Malcontent(malcontent_dir.resolve())
+
+    app.register_blueprint(bp)
+
+    return app
+
+def _(text_key): # gettext of the current app; needs an app context
+    return current_app._hydrilla_gettext(text_key)
+
+def malcontent(): # the Malcontent instance create_app() attached to the app
+    return current_app._hydrilla_malcontent
+
+# TODO: override create_jinja_environment() method of Flask instead of wrapping
+#       Jinja environment
+class MyEnvironment(Environment):
+    """
+    A wrapper class around jinja2.Environment that causes GNU gettext function
+    (as '_' and '__'), url_for function and 'hydrilla_project_url' config option
+    to be passed to every call of each template's render() method.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_template(self, *args, **kwargs):
+        template = super().get_template(*args, **kwargs)
+        old_render = template.render
+
+        def new_render(*args, **kwargs):
+            _ = current_app._hydrilla_gettext
+            project_url = current_app._hydrilla_project_url
+
+            def escaping_gettext(text_key):
+                from markupsafe import escape
+
+                return str(escape(_(text_key)))
+
+            final_kwargs = {
+                '_': escaping_gettext,
+                '__': escaping_gettext,
+                'url_for': url_for,
+                'hydrilla_project_url' : project_url
+            }
+            final_kwargs.update(kwargs)
+
+            return old_render(*args, **final_kwargs)
+
+        template.render = new_render
+
+        return template
+
+j2env = MyEnvironment(loader=PackageLoader(__package__), autoescape=False)
+
+indexpage = j2env.get_template('index.html') # loaded once, at import time
+@bp.route('/')
+def index():
+    return indexpage.render()
+
+identifier_json_re = re.compile(r'^([-0-9a-z.]+)\.json$')
+
+def get_resource_or_mapping(item_type: str, identifier: str) -> Response:
+    """
+    Strip '.json' from 'identifier', look the item up and send its JSON
+    description.
+    """
+    match = identifier_json_re.match(identifier)
+    if not match:
+        abort(404)
+
+    identifier = match.group(1)
+
+    versioned_info = malcontent().infos[item_type].get(identifier)
+
+    info = versioned_info and versioned_info.get_by_ver()
+    if info is None:
+        abort(404)
+
+    # no need for send_from_directory(); path is safe, constructed by us
+    return send_file(malcontent().malcontent_dir_path / item_type / info.path())
+
+@bp.route('/mapping/<string:identifier_dot_json>')
+def get_newest_mapping(identifier_dot_json: str) -> Response: # <id>.json
+    return get_resource_or_mapping('mapping', identifier_dot_json)
+
+@bp.route('/resource/<string:identifier_dot_json>')
+def get_newest_resource(identifier_dot_json: str) -> Response: # <id>.json
+    return get_resource_or_mapping('resource', identifier_dot_json)
+
+@bp.route('/query')
+def query():
+    url = request.args['url']
+
+    mapping_refs = [i.as_query_result() for i in malcontent().query(url)]
+    result = {
+        'api_schema_version': [1],
+        'generated_by': {
+            'name':    'hydrilla'
+        },
+        'mappings': mapping_refs
+    }
+
+    return json.dumps(result)
-- 
cgit v1.2.3