From a14ab0a7601ff5c197fe43d42410d8ed6bfd26a8 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Sat, 13 Nov 2021 20:33:57 +0100 Subject: initial commit --- src/pydrilla/pydrilla.py | 700 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 700 insertions(+) create mode 100644 src/pydrilla/pydrilla.py (limited to 'src/pydrilla/pydrilla.py') diff --git a/src/pydrilla/pydrilla.py b/src/pydrilla/pydrilla.py new file mode 100644 index 0000000..caf05a2 --- /dev/null +++ b/src/pydrilla/pydrilla.py @@ -0,0 +1,700 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +# Main repository logic. +# +# This file is part of Hydrilla +# +# Copyright (C) 2021 Wojtek Kosior +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# +# I, Wojtek Kosior, thereby promise not to sue for violation of this +# file's license. Although I request that you do not make use this code +# in a proprietary program, I am not going to enforce this in court. 
from flask import Flask, Blueprint, current_app, url_for, abort, request
from jinja2 import Environment, PackageLoader
import re
#from hashlib import sha256
import os
import pathlib
import json
import gettext
import logging

# Version of the index.json schema this server understands.
SCHEMA_VERSION = [0, 2]

strip_comment_re = re.compile(r'''
^ # match from the beginning of each line
( # catch the part before '//' comment
  (?: # this group matches either a string or a single out-of-string character
   [^"/] |
   "
   (?: # this group matches any in-a-string character
    [^"\\] | # match any normal character
    \\[^u] | # match any escaped character like '\f' or '\n'
    \\u[a-fA-F0-9]{4} # match an escape
   )*
   "
  )*
)
# expect either end-of-line or a comment:
# * unterminated strings will cause matching to fail
# * bad comment (with '/' instead of '//') will be indicated by second group
#   having length 1 instead of 2 or 0
(//?|$)
''', re.VERBOSE)

def strip_json_comments(text):
    '''
    Remove '//' line comments from JSON-with-comments text so that the result
    can be passed to json.loads().

    Raises json.JSONDecodeError on a lone '/' (malformed comment).
    Unterminated strings are passed through untouched so the json module can
    report them itself.
    '''
    processed = 0
    stripped_text = []
    for line in text.split('\n'):
        match = strip_comment_re.match(line)

        if match is None: # unterminated string
            # ignore this error, let json module report it
            stripped = line
        elif len(match[2]) == 1:
            raise json.JSONDecodeError('bad comment', text,
                                       processed + len(match[1]))
        else:
            stripped = match[1]

        stripped_text.append(stripped)
        # +1 accounts for the '\n' removed by split()
        processed += len(line) + 1

    return '\n'.join(stripped_text)

here = pathlib.Path(__file__).resolve().parent

bp = Blueprint('bp', __package__)

def load_config(config_path):
    '''
    Load a JSON (with '//' comments) config file from config_path.

    Configs referenced by the loaded file under 'try_configs' (load failures
    tolerated) and 'use_configs' (load failures fatal) are loaded afterwards,
    each overriding keys set by earlier ones.  Returns the merged config dict
    with the 'try_configs'/'use_configs' bookkeeping keys removed.
    '''
    config = {}
    to_load = [config_path]
    failures_ok = [False]

    while to_load:
        path = to_load.pop()
        can_fail = failures_ok.pop()

        try:
            # BUGFIX: this used open(config_path), which re-read the
            # top-level config instead of the referenced one.
            with open(path) as config_file:
                new_config = json.loads(strip_json_comments(config_file.read()))
        except Exception as e:
            if can_fail:
                continue
            raise e from None

        config.update(new_config)

        for key, failure_ok in [('try_configs', True), ('use_configs', False)]:
            paths = new_config.get(key, [])
            # reversed so that to_load.pop() processes them in listed order
            paths.reverse()
            to_load.extend(paths)
            failures_ok.extend([failure_ok] * len(paths))

    for key in ['try_configs', 'use_configs']:
        config.pop(key, None)

    return config

def get_content_file_path(path):
    '''
    Convert a '/'-separated relative path string into a pathlib.Path using
    the platform's separator.  Raises ValueError if the path is absolute.
    '''
    if os.path.sep != '/':
        # BUGFIX: str.replace() returns a new string; the result was
        # previously discarded, leaving the path unconverted.
        path = path.replace('/', os.path.sep)

    path = pathlib.Path(path)
    if path.is_absolute():
        raise ValueError(_('path_is_absolute_{}').format(path))

    return path

class MyNotImplError(NotImplementedError):
    '''Raised when a planned but not-yet-completed feature is used.'''
    def __init__(self, what, where):
        super().__init__(_('not_implemented_{what}_{where}')
                         .format(what=what, where=where))

def normalize_version(ver):
    '''
    ver is an array of integers. Strip right-most zeroes from ver.

    Returns a *new* array. Doesn't modify its argument.
    '''
    new_len = 0
    for i, num in enumerate(ver):
        if num != 0:
            new_len = i + 1

    return ver[:new_len]

def parse_version(ver_str):
    '''
    Convert ver_str into an array representation, e.g. for ver_str="4.6.13.0"
    return [4, 6, 13, 0].

    Raises ValueError for non-numeric components.
    '''
    return [int(num) for num in ver_str.split('.')]

def version_string(ver, rev=None):
    '''
    ver is an array of integers. rev is an optional integer. Produce string
    representation of version (optionally with revision number), like:
        1.2.3-5
    No version normalization is performed.
    '''
    return '.'.join([str(n) for n in ver]) + ('' if rev is None else f'-{rev}')

### pad_versions() and compare_versions() likely won't be needed

# def pad_versions(ver1, ver2):
#     '''
#     Each of the arguments is an array of integers. If one of the arrays is
#     shorter than the other, right-pad it with zeroes to make it the same
#     length as the other one.

#     Returns a tuple of *new* arrays. Doesn't modify its arguments.
+# ''' +# if len(ver1) < len(ver2): +# ver2, ver1 = pad_versions(ver2, ver1) +# else: +# ver2 = [*ver2, *([0] * (len(ver1) - len(ver2)))] +# ver1 = [*ver1] + +# return ver1, ver2 + +# def compare_versions(ver1, ver2, rev1=1, rev2=1): +# ''' +# ver1 and ver2 are arrays of integers, with major version number being the +# first array item. If versions specified by arrays of different length need +# to be compared, the shorter array gets padded with zeroes on the right. +# This means that for example version 1.3 could be given as both [1, 3] and +# [1, 3, 0, 0] (aka 1.3.0.0) and either would mean the same. + +# rev1 and rev2 are revision numbers. They are appended to padded ver1 and +# ver2 arrays respectively before comparison. + +# This function returns -1, 0 or 1 when the first ver1 designates +# respectively a version lower than, equal to or greater than the one in +# ver2. +# ''' +# ver1, ver2 = pad_versions(ver1, ver2) +# ver1.append(rev1) +# ver2.append(rev2) + +# for n1, n2 in zip(ver1, ver2): +# if n1 < n2: +# return -1 +# if n1 > n2: +# return 1 + +# return 0 + +class VersionedContentItem: + '''Stores definitions of multiple versions of website content item.''' + def __init__(self): + self.uuid = None + self.identifier = None + self.by_version = {} + self.known_versions = [] + + def register_item(self, item): + if self.identifier is None: + self.identifier = item['identifier'] + self.uuid = item['uuid'] + elif self.uuid != item['uuid']: + raise ValueError(_('uuid_mismatch_{identifier}') + .format(identifier=self.identifier)) + + ver = item['version'] + ver_str = version_string(ver) + + if ver_str in self.by_version: + raise ValueError(_('version_clash_{identifier}_{version}') + .format(identifier=self.identifier, + version=ver_str)) + + self.by_version[ver_str] = item + self.known_versions.append(ver) + +class PatternTreeNode: + ''' + "Pattern Tree" is how we refer to the data structure used for querying + Haketilo patterns. 
Those look like 'https://*.example.com/ab/***'. The goal + is to make it possible for given URL to quickly retrieve all known patterns + that match it. + ''' + def __init__(self): + self.wildcard_matches = [None, None, None] + self.literal_match = None + self.children = {} + + def search(self, segments): + ''' + Yields all matches of this segments sequence against the tree that + starts at this node. Results are produces in order from greatest to + lowest pattern specificity. + ''' + nodes = [self] + + for segment in segments: + next_node = nodes[-1].children.get(segment) + if next_node is None: + break + + nodes.append(next_node) + + nsegments = len(segments) + cond_literal = lambda: len(nodes) == nsegments + cond_wildcard = [ + lambda: len(nodes) + 1 == nsegments and segments[-1] != '*', + lambda: len(nodes) + 1 < nsegments, + lambda: len(nodes) + 1 != nsegments or segments[-1] != '***' + ] + + while nodes: + node = nodes.pop() + + for item, condition in [(node.literal_match, cond_literal), + *zip(node.wildcard_matches, cond_wildcard)]: + if item is not None and condition(): + yield item + + def add(self, segments, item_instantiator): + ''' + Make item queryable through (this branch of) the Pattern Tree. If there + was not yet any item associated with the tree path designated by + segments, create a new one using item_instantiator() function. Return + all items matching this path (both the ones that existed and the ones + just created). 
+ ''' + node = self + + for i, segment in enumerate(segments): + wildcards = node.wildcard_matches + + child = node.children.get(segment) or PatternTreeNode() + node.children[segment] = child + node = child + + if node.literal_match is None: + node.literal_match = item_instantiator() + + if segment not in ('*', '**', '***'): + return [node.literal_match] + + if wildcards[len(segment) - 1] is None: + wildcards[len(segment) - 1] = item_instantiator() + + return [node.literal_match, wildcards[len(segment) - 1]] + +proto_regex = re.compile(r'^(?P\w+)://(?P.*)$') +user_re = r'[^/?#@]+@' # r'(?P[^/?#@]+)@' # discarded for now +query_re = r'\??[^#]*' # r'\??(?P[^#]*)' # discarded for now +domain_re = r'(?P[^/?#]+)' +path_re = r'(?P[^?#]*)' +http_regex = re.compile(f'{domain_re}{path_re}{query_re}.*') +ftp_regex = re.compile(f'(?:{user_re})?{domain_re}{path_re}.*') + +class UrlError(ValueError): + pass + +class DeconstructedUrl: + '''Represents a deconstructed URL or URL pattern''' + def __init__(self, url): + self.url = url + + match = proto_regex.match(url) + if not match: + raise UrlError(_('invalid_URL_{}').format(url)) + + self.proto = match.group('proto') + if self.proto not in ('http', 'https', 'ftp'): + raise UrlError(_('disallowed_protocol_{}').format(proto)) + + if self.proto == 'ftp': + match = ftp_regex.match(match.group('rest')) + elif self.proto in ('http', 'https'): + match = http_regex.match(match.group('rest')) + + if not match: + raise UrlError(_('invalid_URL_{}').format(url)) + + self.domain = match.group('domain').split('.') + self.domain.reverse() + self.path = [*filter(None, match.group('path').split('/'))] + +class MappingItem: + ''' + A mapping, together with one of its patterns, as stored in Pattern Tree. 
+ ''' + def __init__(self, pattern, mapping): + self.pattern = pattern + self.mapping = mapping + + def register(self, patterns_by_proto): + ''' + Make self queryable through the Pattern Tree that starts with the + protocols dictionary passed in the argument. + ''' + deco = DeconstructedUrl(self.pattern) + + domain_tree = patterns_by_proto.get(deco.proto) or PatternTreeNode() + patterns_by_proto[deco.proto] = domain_tree + + for path_tree in domain_tree.add(deco.domain, PatternTreeNode): + for match_list in path_tree.add(deco.path, list): + match_list.append(self) + +class Content: + '''Stores serveable website content.''' + def __init__(self): + self.resources = {} + self.mappings = {} + self.licenses = {} + self.indexes = {} + self.definition_processors = { + 'resource': self.process_resource_or_mapping, + 'mapping': self.process_resource_or_mapping, + 'license': self.process_license + } + self.patterns_by_proto = {} + + @staticmethod + def register_item(dict, item): + ''' + Helper function used to add a versioned item definition to content + data structures. + ''' + identifier = item['identifier'] + versioned_item = dict.get(identifier) + if versioned_item is None: + versioned_item = VersionedContentItem() + dict[identifier] = versioned_item + + versioned_item.register_item(item) + + @staticmethod + def _process_copyright_and_license(definition): + '''Helper function used by other process_*() methods.''' + for field in ['copyright', 'licenses']: + if definition[field] == 'auto': + raise MyNotImplError(f'"{{field}}": "auto"', + definition['source_name']) + + def process_resource_or_mapping(self, definition, index): + ''' + Sanitizes, autocompletes and registers serveable mapping/resource + definition. 
+ ''' + definition['version'] = normalize_version(definition['version']) + + if definition['type'] == 'resource': + self._process_copyright_and_license(definition) + definition['dependencies'] = definition.get('dependencies', []) + self.register_item(self.resources, definition) + else: + self.register_item(self.mappings, definition) + + def process_license(self, license, index): + '''Sanitizes and registers serveable license definition.''' + identifier = license['identifier'] + if identifier in self.licenses: + raise ValueError(_('license_clash_{}').format(identifier)) + + self.licenses[identifier] = license + + def process_index(self, index, source_name): + ''' + Sanitizes, autocompletes and registers data from a loaded index.json + file. + ''' + schema_ver = normalize_version(index['schema_version']) + index['schema_version'] = schema_ver + if schema_ver != SCHEMA_VERSION: + raise ValueError('index_json_schema_mismatch_{found}_{required}' + .format(found=version_string(schema_ver), + required=version_string(SCHEMA_VERSION))) + + if source_name in self.indexes: + raise ValueError(_('source_name_clash_{}').format(source_name)) + + index['source_name'] = source_name + + self._process_copyright_and_license(index) + + self.indexes[source_name] = index + + for definition in index['definitions']: + try: + definition['source_name'] = source_name + definition['source_copyright'] = index['copyright'] + definition['source_licenses'] = index['licenses'] + processor = self.definition_processors[definition['type']] + processor(definition, index) + except Exception as e: + if current_app._pydrilla_werror: + raise e from None + logging.error(_('couldnt_load_definition_from_%s'), subdir_path, + exc_info=True) + @staticmethod + def all_items(versioned_items_dict): + '''Iterator over all registered versions of all items.''' + for versioned_item in versioned_items_dict.values(): + for item in versioned_item.by_version.values(): + yield item + + def report_missing(self): + ''' + Use 
logger to print information about items that are referenced but + were not loaded. + ''' + def report_missing_license(object, object_type, lic): + if object_type == 'index': + logging.error(_('no_index_license_%(source)s_%(lic)s'), + source=object['source_name'], lic=lic) + return + + ver_str = version_string(object['version']) + kwargs = {object_type: object['identifier'], ver: ver_str, lic: lic} + if object_type == 'resource': + fmt = _('no_resource_license_%(resource)s_%(ver)s_%(lic)s') + else: + fmt = _('no_mapping_license_%(mapping)s_%(ver)s_%(lic)s') + + logging.error(fmt, **kwargs) + + for object_type, iterable in [ + ('index', self.indexes.values()), + ('resource', self.all_items(self.resources)) + ]: + for object in iterable: + to_process = [object['licenses']] + licenses = [] + while to_process: + term = to_process.pop() + + if type(term) is str: + if term not in ['or', 'and'] and \ + term not in self.licenses: + report_missing_license(object, object_type, lic) + continue + + to_process.extend(term) + + def report_missing_dependency(resource, dep): + logging.error(_('no_dep_%(resource)s_%(ver)s_%(dep)s'), + dep=dep, resource=resource['identifier'], + ver=version_string(resource['version'])) + + for resource in self.all_items(self.resources): + for dep in resource['dependencies']: + if dep not in self.resources: + report_missing_dependency(resource, dep) + + def report_missing_payload(mapping, payload): + logging.error(_('no_payload_%(mapping)s_%(ver)s_%(payload)s'), + mapping=mapping['identifier'], payload=payload, + ver=version_string(mapping['version'])) + + for mapping in self.all_items(self.mappings): + for payload in mapping['payloads']: + payload = payload['payload'] + if payload not in self.resources: + report_missing_payload(mapping, payload) + + def finalize(self): + ''' + Initialize structures needed to serve queries. Called once after all + data gets loaded. 
+ ''' + for dict in [self.resources, self.mappings]: + for versioned_item in dict.values(): + versioned_item.known_versions.sort() + + for mapping in self.all_items(self.mappings): + for payload in mapping['payloads']: + try: + MappingItem(pattern, mapping)\ + .register(self.patterns_by_proto) + except Exception as e: + if current_app._pydrilla_werror: + raise e from None + logging.error( + _('couldnt_register_%(mapping)s_%(ver)s_%(pattern)s'), + mapping=mapping['identifier'], pattern=pattern, + ver=version_string(mapping['version']) + ) + + def find_item(self, type, identifier, ver=None): + ''' + Find and return definition of the newest version of resource/mapping + named by identifier. If no such resource/mapping exists, return None. + + If ver is specified, instead find and return definition of that version + of the item (or None is absent). + ''' + dict = self.resources if type == 'resource' else self.mappings + versioned_item = dict.get(identifier) + if not versioned_item: + return None + + ver = version_string(ver or versioned_item.known_versions[-1]) + + return versioned_item.by_version.get(ver) + + def query(self, url, max=0): + ''' + Return return registered patterns and mappings (available as + MappingItems) that match url. The maximum number of items yielded may be + limited by using the optional max argument. Its default value, 0, causes + no limit to be imposed. + + If multiple versions of a mapping are applicable, only the most recent + is included in the result. 
+ ''' + deco = DeconstructedUrl(url) + + domain_tree = self.patterns_by_proto.get(deco.proto) \ + or PatternTreeNode() + for path_tree in domain_tree.search(deco.domain): + for item in path_tree.search(deco.path): + if url[-1] == '/' or item.pattern[-1] != '/': + yield item + max -= 1 + if max == 0: + return + +def load_content_from_subdir(subdir_path, source_name, content): + index_path = subdir_path / 'index.json' + with open(index_path) as index_file: + index = json.loads(strip_json_comments(index_file.read())) + + content.process_index(index, source_name) + +def load_content(path): + path = pathlib.Path(path) + if not path.is_dir(): + raise ValueError(_('content_dir_path_not_dir')) + + content = Content() + + for subdir_path in path.iterdir(): + if not subdir_path.is_dir(): + continue + try: + load_content_from_subdir(subdir_path, subdir_path.name, content) + except Exception as e: + if current_app._pydrilla_werror: + raise e from None + logging.error(_('couldnt_load_content_from_%s'), subdir_path, + exc_info=True) + + content.report_missing() + content.finalize() + + return content + +def create_app(config_path=(here / 'config.json'), flask_config={}): + app = Flask(__package__) + app.config.update(flask_config) + + language = flask_config.get('lang', 'en') + translation = gettext.translation('pydrilla', localedir=(here / 'locales'), + languages=[language]) + + app._pydrilla_gettext = translation.gettext + + # https://stackoverflow.com/questions/9449101/how-to-stop-flask-from-initialising-twice-in-debug-mode + if app.debug and os.environ.get('WERKZEUG_RUN_MAIN') != 'true': + return app + + config = load_config(config_path) + for key in ['static_resource_uri', 'content_dir']: + if key not in config: + raise ValueError(_('config_key_absent_{}').format(key)) + + app._pydrilla_static_resource_uri = config['static_resource_uri'] + app._pydrilla_werror = config.get('werror', False) + if 'hydrilla_parent' in config: + raise MyNotImplError('hydrilla_parent', 
config_path.name) + with app.app_context(): + app._pydrilla_content = load_content(config['content_dir']) + + app.register_blueprint(bp) + + return app + +def _(text_key): + return current_app._pydrilla_gettext(text_key) + +def escaping_gettext(text_key): + from markupsafe import escape + + return str(escape(_(text_key))) + +class MyEnvironment(Environment): + ''' + A wrapper class around jinja2.Environment that causes GNU gettext function + (as '_' and '__') and url_for function to be passed to every call of each + template's render() method. + ''' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_template(self, *args, **kwargs): + template = super().get_template(*args, **kwargs) + old_render = template.render + + def new_render(*args, **kwargs): + final_kwargs = { + '_': escaping_gettext, + '__': escaping_gettext, + 'url_for': url_for + } + final_kwargs.update(kwargs) + + return old_render(*args, **final_kwargs) + + template.render = new_render + + return template + +j2env = MyEnvironment(loader=PackageLoader(__package__), autoescape=False) + +indexpage = j2env.get_template('index.html') +@bp.route('/') +def index(): + return indexpage.render(content=current_app._pydrilla_resources_map) + +for item_type in ['resource', 'mapping']: + def item(identifier): + ver = request.args.get('ver') + if ver is not None: + try: + ver = normalize_version(parse_version(ver)) + except: + abort(400) + + item = current_app._pydrilla_content\ + .find_item(item_type, identifier, ver) + if item is None: + abort(404) + + return json.dumps(item) + + item.__name__ = item_type + 's' + bp.route(f'/{item_type}s/')(item) -- cgit v1.2.3