From a14ab0a7601ff5c197fe43d42410d8ed6bfd26a8 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Sat, 13 Nov 2021 20:33:57 +0100 Subject: initial commit --- src/pydrilla/pydrilla.py | 700 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 700 insertions(+) create mode 100644 src/pydrilla/pydrilla.py (limited to 'src/pydrilla/pydrilla.py') diff --git a/src/pydrilla/pydrilla.py b/src/pydrilla/pydrilla.py new file mode 100644 index 0000000..caf05a2 --- /dev/null +++ b/src/pydrilla/pydrilla.py @@ -0,0 +1,700 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +# Main repository logic. +# +# This file is part of Hydrilla +# +# Copyright (C) 2021 Wojtek Kosior +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# +# I, Wojtek Kosior, thereby promise not to sue for violation of this +# file's license. Although I request that you do not make use this code +# in a proprietary program, I am not going to enforce this in court. 
from flask import Flask, Blueprint, current_app, url_for, abort, request
from jinja2 import Environment, PackageLoader
import re
#from hashlib import sha256
import os
import pathlib
import json
import gettext
import logging

# Version of the index.json schema this server understands.
SCHEMA_VERSION = [0, 2]

strip_comment_re = re.compile(r'''
^ # match from the beginning of each line
( # catch the part before '//' comment
  (?: # this group matches either a string or a single out-of-string character
   [^"/] |
   "
   (?: # this group matches any in-a-string character
    [^"\\] | # match any normal character
    \\[^u] | # match any escaped character like '\f' or '\n'
    \\u[a-fA-F0-9]{4} # match an escape
   )*
   "
  )*
)
# expect either end-of-line or a comment:
# * unterminated strings will cause matching to fail
# * bad comment (with '/' instead of '//') will be indicated by second group
#   having length 1 instead of 2 or 0
(//?|$)
''', re.VERBOSE)

def strip_json_comments(text):
    '''
    Remove '//' line comments from JSON-with-comments text so that the result
    can be passed to json.loads().

    Raises json.JSONDecodeError on a lone '/' (malformed comment).
    Unterminated strings are passed through untouched so the json module can
    report them itself.
    '''
    processed = 0
    stripped_text = []
    for line in text.split('\n'):
        match = strip_comment_re.match(line)

        if match is None: # unterminated string
            # ignore this error, let json module report it
            stripped = line
        elif len(match[2]) == 1:
            raise json.JSONDecodeError('bad comment', text,
                                       processed + len(match[1]))
        else:
            stripped = match[1]

        stripped_text.append(stripped)
        # +1 accounts for the '\n' removed by split()
        processed += len(line) + 1

    return '\n'.join(stripped_text)

here = pathlib.Path(__file__).resolve().parent

bp = Blueprint('bp', __package__)

def load_config(config_path):
    '''
    Load a JSON (with '//' comments) config file from config_path.

    Configs referenced by the loaded file under 'try_configs' (load failures
    tolerated) and 'use_configs' (load failures fatal) are loaded afterwards,
    each overriding keys set by earlier ones.  Returns the merged config dict
    with the 'try_configs'/'use_configs' bookkeeping keys removed.
    '''
    config = {}
    to_load = [config_path]
    failures_ok = [False]

    while to_load:
        path = to_load.pop()
        can_fail = failures_ok.pop()

        try:
            # BUGFIX: this used open(config_path), which re-read the
            # top-level config instead of the referenced one.
            with open(path) as config_file:
                new_config = json.loads(strip_json_comments(config_file.read()))
        except Exception as e:
            if can_fail:
                continue
            raise e from None

        config.update(new_config)

        for key, failure_ok in [('try_configs', True), ('use_configs', False)]:
            paths = new_config.get(key, [])
            # reversed so that to_load.pop() processes them in listed order
            paths.reverse()
            to_load.extend(paths)
            failures_ok.extend([failure_ok] * len(paths))

    for key in ['try_configs', 'use_configs']:
        config.pop(key, None)

    return config

def get_content_file_path(path):
    '''
    Convert a '/'-separated relative path string into a pathlib.Path using
    the platform's separator.  Raises ValueError if the path is absolute.
    '''
    if os.path.sep != '/':
        # BUGFIX: str.replace() returns a new string; the result was
        # previously discarded, leaving the path unconverted.
        path = path.replace('/', os.path.sep)

    path = pathlib.Path(path)
    if path.is_absolute():
        raise ValueError(_('path_is_absolute_{}').format(path))

    return path

class MyNotImplError(NotImplementedError):
    '''Raised when a planned but not-yet-completed feature is used.'''
    def __init__(self, what, where):
        super().__init__(_('not_implemented_{what}_{where}')
                         .format(what=what, where=where))

def normalize_version(ver):
    '''
    ver is an array of integers. Strip right-most zeroes from ver.

    Returns a *new* array. Doesn't modify its argument.
    '''
    new_len = 0
    for i, num in enumerate(ver):
        if num != 0:
            new_len = i + 1

    return ver[:new_len]

def parse_version(ver_str):
    '''
    Convert ver_str into an array representation, e.g. for ver_str="4.6.13.0"
    return [4, 6, 13, 0].

    Raises ValueError for non-numeric components.
    '''
    return [int(num) for num in ver_str.split('.')]

def version_string(ver, rev=None):
    '''
    ver is an array of integers. rev is an optional integer. Produce string
    representation of version (optionally with revision number), like:
        1.2.3-5
    No version normalization is performed.
    '''
    return '.'.join([str(n) for n in ver]) + ('' if rev is None else f'-{rev}')

### pad_versions() and compare_versions() likely won't be needed

# def pad_versions(ver1, ver2):
#     '''
#     Each of the arguments is an array of integers. If one of the arrays is
#     shorter than the other, right-pad it with zeroes to make it the same
#     length as the other one.

#     Returns a tuple of *new* arrays. Doesn't modify its arguments.
+# ''' +# if len(ver1) < len(ver2): +# ver2, ver1 = pad_versions(ver2, ver1) +# else: +# ver2 = [*ver2, *([0] * (len(ver1) - len(ver2)))] +# ver1 = [*ver1] + +# return ver1, ver2 + +# def compare_versions(ver1, ver2, rev1=1, rev2=1): +# ''' +# ver1 and ver2 are arrays of integers, with major version number being the +# first array item. If versions specified by arrays of different length need +# to be compared, the shorter array gets padded with zeroes on the right. +# This means that for example version 1.3 could be given as both [1, 3] and +# [1, 3, 0, 0] (aka 1.3.0.0) and either would mean the same. + +# rev1 and rev2 are revision numbers. They are appended to padded ver1 and +# ver2 arrays respectively before comparison. + +# This function returns -1, 0 or 1 when the first ver1 designates +# respectively a version lower than, equal to or greater than the one in +# ver2. +# ''' +# ver1, ver2 = pad_versions(ver1, ver2) +# ver1.append(rev1) +# ver2.append(rev2) + +# for n1, n2 in zip(ver1, ver2): +# if n1 < n2: +# return -1 +# if n1 > n2: +# return 1 + +# return 0 + +class VersionedContentItem: + '''Stores definitions of multiple versions of website content item.''' + def __init__(self): + self.uuid = None + self.identifier = None + self.by_version = {} + self.known_versions = [] + + def register_item(self, item): + if self.identifier is None: + self.identifier = item['identifier'] + self.uuid = item['uuid'] + elif self.uuid != item['uuid']: + raise ValueError(_('uuid_mismatch_{identifier}') + .format(identifier=self.identifier)) + + ver = item['version'] + ver_str = version_string(ver) + + if ver_str in self.by_version: + raise ValueError(_('version_clash_{identifier}_{version}') + .format(identifier=self.identifier, + version=ver_str)) + + self.by_version[ver_str] = item + self.known_versions.append(ver) + +class PatternTreeNode: + ''' + "Pattern Tree" is how we refer to the data structure used for querying + Haketilo patterns. 
Those look like 'https://*.example.com/ab/***'. The goal + is to make it possible for given URL to quickly retrieve all known patterns + that match it. + ''' + def __init__(self): + self.wildcard_matches = [None, None, None] + self.literal_match = None + self.children = {} + + def search(self, segments): + ''' + Yields all matches of this segments sequence against the tree that + starts at this node. Results are produces in order from greatest to + lowest pattern specificity. + ''' + nodes = [self] + + for segment in segments: + next_node = nodes[-1].children.get(segment) + if next_node is None: + break + + nodes.append(next_node) + + nsegments = len(segments) + cond_literal = lambda: len(nodes) == nsegments + cond_wildcard = [ + lambda: len(nodes) + 1 == nsegments and segments[-1] != '*', + lambda: len(nodes) + 1 < nsegments, + lambda: len(nodes) + 1 != nsegments or segments[-1] != '***' + ] + + while nodes: + node = nodes.pop() + + for item, condition in [(node.literal_match, cond_literal), + *zip(node.wildcard_matches, cond_wildcard)]: + if item is not None and condition(): + yield item + + def add(self, segments, item_instantiator): + ''' + Make item queryable through (this branch of) the Pattern Tree. If there + was not yet any item associated with the tree path designated by + segments, create a new one using item_instantiator() function. Return + all items matching this path (both the ones that existed and the ones + just created). 
+ ''' + node = self + + for i, segment in enumerate(segments): + wildcards = node.wildcard_matches + + child = node.children.get(segment) or PatternTreeNode() + node.children[segment] = child + node = child + + if node.literal_match is None: + node.literal_match = item_instantiator() + + if segment not in ('*', '**', '***'): + return [node.literal_match] + + if wildcards[len(segment) - 1] is None: + wildcards[len(segment) - 1] = item_instantiator() + + return [node.literal_match, wildcards[len(segment) - 1]] + +proto_regex = re.compile(r'^(?P\w+)://(?P.*)$') +user_re = r'[^/?#@]+@' # r'(?P[^/?#@]+)@' # discarded for now +query_re = r'\??[^#]*' # r'\??(?P[^#]*)' # discarded for now +domain_re = r'(?P[^/?#]+)' +path_re = r'(?P[^?#]*)' +http_regex = re.compile(f'{domain_re}{path_re}{query_re}.*') +ftp_regex = re.compile(f'(?:{user_re})?{domain_re}{path_re}.*') + +class UrlError(ValueError): + pass + +class DeconstructedUrl: + '''Represents a deconstructed URL or URL pattern''' + def __init__(self, url): + self.url = url + + match = proto_regex.match(url) + if not match: + raise UrlError(_('invalid_URL_{}').format(url)) + + self.proto = match.group('proto') + if self.proto not in ('http', 'https', 'ftp'): + raise UrlError(_('disallowed_protocol_{}').format(proto)) + + if self.proto == 'ftp': + match = ftp_regex.match(match.group('rest')) + elif self.proto in ('http', 'https'): + match = http_regex.match(match.group('rest')) + + if not match: + raise UrlError(_('invalid_URL_{}').format(url)) + + self.domain = match.group('domain').split('.') + self.domain.reverse() + self.path = [*filter(None, match.group('path').split('/'))] + +class MappingItem: + ''' + A mapping, together with one of its patterns, as stored in Pattern Tree. 
+ ''' + def __init__(self, pattern, mapping): + self.pattern = pattern + self.mapping = mapping + + def register(self, patterns_by_proto): + ''' + Make self queryable through the Pattern Tree that starts with the + protocols dictionary passed in the argument. + ''' + deco = DeconstructedUrl(self.pattern) + + domain_tree = patterns_by_proto.get(deco.proto) or PatternTreeNode() + patterns_by_proto[deco.proto] = domain_tree + + for path_tree in domain_tree.add(deco.domain, PatternTreeNode): + for match_list in path_tree.add(deco.path, list): + match_list.append(self) + +class Content: + '''Stores serveable website content.''' + def __init__(self): + self.resources = {} + self.mappings = {} + self.licenses = {} + self.indexes = {} + self.definition_processors = { + 'resource': self.process_resource_or_mapping, + 'mapping': self.process_resource_or_mapping, + 'license': self.process_license + } + self.patterns_by_proto = {} + + @staticmethod + def register_item(dict, item): + ''' + Helper function used to add a versioned item definition to content + data structures. + ''' + identifier = item['identifier'] + versioned_item = dict.get(identifier) + if versioned_item is None: + versioned_item = VersionedContentItem() + dict[identifier] = versioned_item + + versioned_item.register_item(item) + + @staticmethod + def _process_copyright_and_license(definition): + '''Helper function used by other process_*() methods.''' + for field in ['copyright', 'licenses']: + if definition[field] == 'auto': + raise MyNotImplError(f'"{{field}}": "auto"', + definition['source_name']) + + def process_resource_or_mapping(self, definition, index): + ''' + Sanitizes, autocompletes and registers serveable mapping/resource + definition. 
+ ''' + definition['version'] = normalize_version(definition['version']) + + if definition['type'] == 'resource': + self._process_copyright_and_license(definition) + definition['dependencies'] = definition.get('dependencies', []) + self.register_item(self.resources, definition) + else: + self.register_item(self.mappings, definition) + + def process_license(self, license, index): + '''Sanitizes and registers serveable license definition.''' + identifier = license['identifier'] + if identifier in self.licenses: + raise ValueError(_('license_clash_{}').format(identifier)) + + self.licenses[identifier] = license + + def process_index(self, index, source_name): + ''' + Sanitizes, autocompletes and registers data from a loaded index.json + file. + ''' + schema_ver = normalize_version(index['schema_version']) + index['schema_version'] = schema_ver + if schema_ver != SCHEMA_VERSION: + raise ValueError('index_json_schema_mismatch_{found}_{required}' + .format(found=version_string(schema_ver), + required=version_string(SCHEMA_VERSION))) + + if source_name in self.indexes: + raise ValueError(_('source_name_clash_{}').format(source_name)) + + index['source_name'] = source_name + + self._process_copyright_and_license(index) + + self.indexes[source_name] = index + + for definition in index['definitions']: + try: + definition['source_name'] = source_name + definition['source_copyright'] = index['copyright'] + definition['source_licenses'] = index['licenses'] + processor = self.definition_processors[definition['type']] + processor(definition, index) + except Exception as e: + if current_app._pydrilla_werror: + raise e from None + logging.error(_('couldnt_load_definition_from_%s'), subdir_path, + exc_info=True) + @staticmethod + def all_items(versioned_items_dict): + '''Iterator over all registered versions of all items.''' + for versioned_item in versioned_items_dict.values(): + for item in versioned_item.by_version.values(): + yield item + + def report_missing(self): + ''' + Use 
logger to print information about items that are referenced but + were not loaded. + ''' + def report_missing_license(object, object_type, lic): + if object_type == 'index': + logging.error(_('no_index_license_%(source)s_%(lic)s'), + source=object['source_name'], lic=lic) + return + + ver_str = version_string(object['version']) + kwargs = {object_type: object['identifier'], ver: ver_str, lic: lic} + if object_type == 'resource': + fmt = _('no_resource_license_%(resource)s_%(ver)s_%(lic)s') + else: + fmt = _('no_mapping_license_%(mapping)s_%(ver)s_%(lic)s') + + logging.error(fmt, **kwargs) + + for object_type, iterable in [ + ('index', self.indexes.values()), + ('resource', self.all_items(self.resources)) + ]: + for object in iterable: + to_process = [object['licenses']] + licenses = [] + while to_process: + term = to_process.pop() + + if type(term) is str: + if term not in ['or', 'and'] and \ + term not in self.licenses: + report_missing_license(object, object_type, lic) + continue + + to_process.extend(term) + + def report_missing_dependency(resource, dep): + logging.error(_('no_dep_%(resource)s_%(ver)s_%(dep)s'), + dep=dep, resource=resource['identifier'], + ver=version_string(resource['version'])) + + for resource in self.all_items(self.resources): + for dep in resource['dependencies']: + if dep not in self.resources: + report_missing_dependency(resource, dep) + + def report_missing_payload(mapping, payload): + logging.error(_('no_payload_%(mapping)s_%(ver)s_%(payload)s'), + mapping=mapping['identifier'], payload=payload, + ver=version_string(mapping['version'])) + + for mapping in self.all_items(self.mappings): + for payload in mapping['payloads']: + payload = payload['payload'] + if payload not in self.resources: + report_missing_payload(mapping, payload) + + def finalize(self): + ''' + Initialize structures needed to serve queries. Called once after all + data gets loaded. 
+ ''' + for dict in [self.resources, self.mappings]: + for versioned_item in dict.values(): + versioned_item.known_versions.sort() + + for mapping in self.all_items(self.mappings): + for payload in mapping['payloads']: + try: + MappingItem(pattern, mapping)\ + .register(self.patterns_by_proto) + except Exception as e: + if current_app._pydrilla_werror: + raise e from None + logging.error( + _('couldnt_register_%(mapping)s_%(ver)s_%(pattern)s'), + mapping=mapping['identifier'], pattern=pattern, + ver=version_string(mapping['version']) + ) + + def find_item(self, type, identifier, ver=None): + ''' + Find and return definition of the newest version of resource/mapping + named by identifier. If no such resource/mapping exists, return None. + + If ver is specified, instead find and return definition of that version + of the item (or None is absent). + ''' + dict = self.resources if type == 'resource' else self.mappings + versioned_item = dict.get(identifier) + if not versioned_item: + return None + + ver = version_string(ver or versioned_item.known_versions[-1]) + + return versioned_item.by_version.get(ver) + + def query(self, url, max=0): + ''' + Return return registered patterns and mappings (available as + MappingItems) that match url. The maximum number of items yielded may be + limited by using the optional max argument. Its default value, 0, causes + no limit to be imposed. + + If multiple versions of a mapping are applicable, only the most recent + is included in the result. 
+ ''' + deco = DeconstructedUrl(url) + + domain_tree = self.patterns_by_proto.get(deco.proto) \ + or PatternTreeNode() + for path_tree in domain_tree.search(deco.domain): + for item in path_tree.search(deco.path): + if url[-1] == '/' or item.pattern[-1] != '/': + yield item + max -= 1 + if max == 0: + return + +def load_content_from_subdir(subdir_path, source_name, content): + index_path = subdir_path / 'index.json' + with open(index_path) as index_file: + index = json.loads(strip_json_comments(index_file.read())) + + content.process_index(index, source_name) + +def load_content(path): + path = pathlib.Path(path) + if not path.is_dir(): + raise ValueError(_('content_dir_path_not_dir')) + + content = Content() + + for subdir_path in path.iterdir(): + if not subdir_path.is_dir(): + continue + try: + load_content_from_subdir(subdir_path, subdir_path.name, content) + except Exception as e: + if current_app._pydrilla_werror: + raise e from None + logging.error(_('couldnt_load_content_from_%s'), subdir_path, + exc_info=True) + + content.report_missing() + content.finalize() + + return content + +def create_app(config_path=(here / 'config.json'), flask_config={}): + app = Flask(__package__) + app.config.update(flask_config) + + language = flask_config.get('lang', 'en') + translation = gettext.translation('pydrilla', localedir=(here / 'locales'), + languages=[language]) + + app._pydrilla_gettext = translation.gettext + + # https://stackoverflow.com/questions/9449101/how-to-stop-flask-from-initialising-twice-in-debug-mode + if app.debug and os.environ.get('WERKZEUG_RUN_MAIN') != 'true': + return app + + config = load_config(config_path) + for key in ['static_resource_uri', 'content_dir']: + if key not in config: + raise ValueError(_('config_key_absent_{}').format(key)) + + app._pydrilla_static_resource_uri = config['static_resource_uri'] + app._pydrilla_werror = config.get('werror', False) + if 'hydrilla_parent' in config: + raise MyNotImplError('hydrilla_parent', 
config_path.name) + with app.app_context(): + app._pydrilla_content = load_content(config['content_dir']) + + app.register_blueprint(bp) + + return app + +def _(text_key): + return current_app._pydrilla_gettext(text_key) + +def escaping_gettext(text_key): + from markupsafe import escape + + return str(escape(_(text_key))) + +class MyEnvironment(Environment): + ''' + A wrapper class around jinja2.Environment that causes GNU gettext function + (as '_' and '__') and url_for function to be passed to every call of each + template's render() method. + ''' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_template(self, *args, **kwargs): + template = super().get_template(*args, **kwargs) + old_render = template.render + + def new_render(*args, **kwargs): + final_kwargs = { + '_': escaping_gettext, + '__': escaping_gettext, + 'url_for': url_for + } + final_kwargs.update(kwargs) + + return old_render(*args, **final_kwargs) + + template.render = new_render + + return template + +j2env = MyEnvironment(loader=PackageLoader(__package__), autoescape=False) + +indexpage = j2env.get_template('index.html') +@bp.route('/') +def index(): + return indexpage.render(content=current_app._pydrilla_resources_map) + +for item_type in ['resource', 'mapping']: + def item(identifier): + ver = request.args.get('ver') + if ver is not None: + try: + ver = normalize_version(parse_version(ver)) + except: + abort(400) + + item = current_app._pydrilla_content\ + .find_item(item_type, identifier, ver) + if item is None: + abort(404) + + return json.dumps(item) + + item.__name__ = item_type + 's' + bp.route(f'/{item_type}s/')(item) -- cgit v1.2.3