aboutsummaryrefslogtreecommitdiff
path: root/src/pydrilla/pydrilla.py
diff options
context:
space:
mode:
authorWojtek Kosior <koszko@koszko.org>2021-11-13 20:33:57 +0100
committerWojtek Kosior <koszko@koszko.org>2021-11-13 20:33:57 +0100
commita14ab0a7601ff5c197fe43d42410d8ed6bfd26a8 (patch)
treebefa6fc0b1de552bae1e2a832a25cb0dd8f58412 /src/pydrilla/pydrilla.py
downloadhaketilo-hydrilla-a14ab0a7601ff5c197fe43d42410d8ed6bfd26a8.tar.gz
haketilo-hydrilla-a14ab0a7601ff5c197fe43d42410d8ed6bfd26a8.zip
initial commit
Diffstat (limited to 'src/pydrilla/pydrilla.py')
-rw-r--r--src/pydrilla/pydrilla.py700
1 files changed, 700 insertions, 0 deletions
diff --git a/src/pydrilla/pydrilla.py b/src/pydrilla/pydrilla.py
new file mode 100644
index 0000000..caf05a2
--- /dev/null
+++ b/src/pydrilla/pydrilla.py
@@ -0,0 +1,700 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+# Main repository logic.
+#
+# This file is part of Hydrilla
+#
+# Copyright (C) 2021 Wojtek Kosior
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#
+# I, Wojtek Kosior, thereby promise not to sue for violation of this
+# file's license. Although I request that you do not make use of this code
+# in a proprietary program, I am not going to enforce this in court.
+
+from flask import Flask, Blueprint, current_app, url_for, abort, request
+from jinja2 import Environment, PackageLoader
+import re
+#from hashlib import sha256
+import os
+import pathlib
+import json
+import gettext
+import logging
+
# index.json schema version accepted by this code; compared against each
# index's normalized 'schema_version' field in Content.process_index().
SCHEMA_VERSION = [0, 2]
+
strip_comment_re = re.compile(r'''
^ # match from the beginning of each line
( # catch the part before '//' comment
  (?: # this group matches either a string or a single out-of-string character
    [^"/] |
    "
    (?: # this group matches any in-a-string character
      [^"\\] | # match any normal character
      \\[^u] | # match any escaped character like '\f' or '\n'
      \\u[a-fA-F0-9]{4} # match an escape
    )*
    "
  )*
)
# expect either end-of-line or a comment:
# * unterminated strings will cause matching to fail
# * bad comment (with '/' instead of '//') will be indicated by second group
# having length 1 instead of 2 or 0
(//?|$)
''', re.VERBOSE)

def strip_json_comments(text):
    '''
    Return text with every '//' line comment removed. A lone '/' where a
    comment would start raises json.JSONDecodeError; a line containing an
    unterminated string is passed through untouched so that the json module
    can report the error itself.
    '''
    result_lines = []
    offset = 0  # character offset of the current line within text

    for line in text.split('\n'):
        found = strip_comment_re.match(line)

        if found is None:
            # Unterminated string literal: keep the line as-is and let the
            # json module produce the error message later.
            result_lines.append(line)
        elif len(found[2]) == 1:
            # A single '/' is neither a comment nor valid JSON.
            raise json.JSONDecodeError('bad comment', text,
                                       offset + len(found[1]))
        else:
            result_lines.append(found[1])

        offset += len(line) + 1  # +1 accounts for the '\n' separator

    return '\n'.join(result_lines)
+
# Directory containing this module; used to locate the bundled default
# config.json and the gettext 'locales' directory (see create_app()).
here = pathlib.Path(__file__).resolve().parent

# Blueprint collecting all of pydrilla's URL routes; registered on the
# application object in create_app().
bp = Blueprint('bp', __package__)
+
def load_config(config_path):
    '''
    Load JSON (with '//' comments) configuration from config_path and return
    it as a dict. Additional config files referenced through the
    'try_configs' (load failures tolerated) and 'use_configs' (load failures
    fatal) keys are loaded as well, with values from later files overriding
    earlier ones. The two referencing keys are stripped from the result.
    '''
    config = {}
    to_load = [config_path]
    failures_ok = [False]

    while to_load:
        path = to_load.pop()
        can_fail = failures_ok.pop()

        try:
            # Fix: this used to open config_path on every iteration, so the
            # referenced configs were never actually read (and 'use_configs'
            # entries caused the root config to be re-queued forever).
            with open(path) as config_file:
                new_config = json.loads(strip_json_comments(config_file.read()))
        except Exception as e:
            if can_fail:
                continue
            raise e from None

        config.update(new_config)

        for key, failure_ok in [('try_configs', True), ('use_configs', False)]:
            paths = new_config.get(key, [])
            # Reverse so that pop() processes entries in their listed order.
            paths.reverse()
            to_load.extend(paths)
            failures_ok.extend([failure_ok] * len(paths))

    # The referencing keys are bookkeeping only; don't expose them.
    for key in ['try_configs', 'use_configs']:
        if key in config:
            config.pop(key)

    return config
+
def get_content_file_path(path):
    '''
    Convert a slash-separated relative path string (as used in index.json)
    to a pathlib.Path, translating separators on non-POSIX systems.

    Raises ValueError (translated message) if the path is absolute.
    '''
    if os.path.sep != '/':
        # Fix: str.replace() returns a new string; the result used to be
        # discarded, so separators were never actually translated.
        path = path.replace('/', os.path.sep)

    path = pathlib.Path(path)
    if path.is_absolute():
        raise ValueError(_('path_is_absolute_{}').format(path))

    return path
+
class MyNotImplError(NotImplementedError):
    '''Raised when a planned but not-yet-completed feature is used.'''
    def __init__(self, what, where):
        # The message is translated via _(), so constructing this exception
        # requires an active Flask application context.
        super().__init__(_('not_implemented_{what}_{where}')
                         .format(what=what, where=where))
+
def normalize_version(ver):
    '''
    ver is an array of integers. Strip right-most zeroes from ver.

    Returns a *new* array. Doesn't modify its argument.
    '''
    cut = len(ver)
    # Walk back from the end past any trailing zero components.
    while cut > 0 and ver[cut - 1] == 0:
        cut -= 1

    return ver[:cut]
+
def parse_version(ver_str):
    '''
    Convert ver_str into an array representation, e.g. for ver_str="4.6.13.0"
    return [4, 6, 13, 0].
    '''
    return list(map(int, ver_str.split('.')))
+
def version_string(ver, rev=None):
    '''
    ver is an array of integers. rev is an optional integer. Produce string
    representation of version (optionally with revision number), like:
        1.2.3-5
    No version normalization is performed.
    '''
    base = '.'.join(map(str, ver))
    if rev is None:
        return base

    return f'{base}-{rev}'
+
+### pad_versions() and compare_versions() likely won't be needed
+
+# def pad_versions(ver1, ver2):
+# '''
+# Each of the arguments is an array of integers. If one of the arrays is
+# shorter than the other, right-pad it with zeroes to make it the same
+# length as the other one.
+
+# Returns a tuple of *new* arrays. Doesn't modify its arguments.
+# '''
+# if len(ver1) < len(ver2):
+# ver2, ver1 = pad_versions(ver2, ver1)
+# else:
+# ver2 = [*ver2, *([0] * (len(ver1) - len(ver2)))]
+# ver1 = [*ver1]
+
+# return ver1, ver2
+
+# def compare_versions(ver1, ver2, rev1=1, rev2=1):
+# '''
+# ver1 and ver2 are arrays of integers, with major version number being the
+# first array item. If versions specified by arrays of different length need
+# to be compared, the shorter array gets padded with zeroes on the right.
+# This means that for example version 1.3 could be given as both [1, 3] and
+# [1, 3, 0, 0] (aka 1.3.0.0) and either would mean the same.
+
+# rev1 and rev2 are revision numbers. They are appended to padded ver1 and
+# ver2 arrays respectively before comparison.
+
+# This function returns -1, 0 or 1 when the first ver1 designates
+# respectively a version lower than, equal to or greater than the one in
+# ver2.
+# '''
+# ver1, ver2 = pad_versions(ver1, ver2)
+# ver1.append(rev1)
+# ver2.append(rev2)
+
+# for n1, n2 in zip(ver1, ver2):
+# if n1 < n2:
+# return -1
+# if n1 > n2:
+# return 1
+
+# return 0
+
class VersionedContentItem:
    '''Stores definitions of multiple versions of website content item.'''
    def __init__(self):
        self.uuid = None
        self.identifier = None
        # Definition dicts keyed by their version string.
        self.by_version = {}
        # Version arrays in registration order (sorted later by finalize()).
        self.known_versions = []

    def register_item(self, item):
        '''Record one version of the item, verifying identity consistency.'''
        if self.identifier is None:
            # The first registered definition establishes our identity.
            self.identifier = item['identifier']
            self.uuid = item['uuid']
        elif self.uuid != item['uuid']:
            raise ValueError(_('uuid_mismatch_{identifier}')
                             .format(identifier=self.identifier))

        version = item['version']
        version_str = version_string(version)

        if version_str in self.by_version:
            raise ValueError(_('version_clash_{identifier}_{version}')
                             .format(identifier=self.identifier,
                                     version=version_str))

        self.by_version[version_str] = item
        self.known_versions.append(version)
+
class PatternTreeNode:
    '''
    "Pattern Tree" is how we refer to the data structure used for querying
    Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
    is to make it possible for given URL to quickly retrieve all known patterns
    that match it.
    '''
    def __init__(self):
        # wildcard_matches[i] holds the item registered under a wildcard
        # segment of length i+1 ('*', '**', '***', in that order).
        self.wildcard_matches = [None, None, None]
        # Item registered for an exact (non-wildcard) path ending here.
        self.literal_match = None
        # Child nodes keyed by the next segment of the path/domain.
        self.children = {}

    def search(self, segments):
        '''
        Yields all matches of this segments sequence against the tree that
        starts at this node. Results are produces in order from greatest to
        lowest pattern specificity.
        '''
        # Descend as far as the literal segments allow; nodes[0] is self and
        # nodes[-1] is the deepest node reached.
        nodes = [self]

        for segment in segments:
            next_node = nodes[-1].children.get(segment)
            if next_node is None:
                break

            nodes.append(next_node)

        nsegments = len(segments)
        # These closures are evaluated after the pop() below, i.e. len(nodes)
        # is then the depth of the popped node. A literal match requires all
        # segments to have been consumed; the three wildcard conditions
        # correspond to '*', '**' and '***' respectively.
        cond_literal = lambda: len(nodes) == nsegments
        cond_wildcard = [
            lambda: len(nodes) + 1 == nsegments and segments[-1] != '*',
            lambda: len(nodes) + 1 < nsegments,
            lambda: len(nodes) + 1 != nsegments or segments[-1] != '***'
        ]

        # Walk back up from the deepest node so more specific patterns are
        # yielded before less specific ones.
        while nodes:
            node = nodes.pop()

            for item, condition in [(node.literal_match, cond_literal),
                                    *zip(node.wildcard_matches, cond_wildcard)]:
                if item is not None and condition():
                    yield item

    def add(self, segments, item_instantiator):
        '''
        Make item queryable through (this branch of) the Pattern Tree. If there
        was not yet any item associated with the tree path designated by
        segments, create a new one using item_instantiator() function. Return
        all items matching this path (both the ones that existed and the ones
        just created).
        '''
        # NOTE(review): assumes segments is non-empty — with an empty
        # sequence the loop never runs and 'segment'/'wildcards' below would
        # be unbound. Confirm callers never pass an empty path.
        node = self

        for i, segment in enumerate(segments):
            # 'wildcards' deliberately ends up referring to the *parent* of
            # the final node: a wildcard match is stored one level up.
            wildcards = node.wildcard_matches

            child = node.children.get(segment) or PatternTreeNode()
            node.children[segment] = child
            node = child

        if node.literal_match is None:
            node.literal_match = item_instantiator()

        # A non-wildcard final segment can only ever match literally.
        if segment not in ('*', '**', '***'):
            return [node.literal_match]

        # Wildcard slot index is the wildcard's character length minus one.
        if wildcards[len(segment) - 1] is None:
            wildcards[len(segment) - 1] = item_instantiator()

        return [node.literal_match, wildcards[len(segment) - 1]]
+
proto_regex = re.compile(r'^(?P<proto>\w+)://(?P<rest>.*)$')
user_re = r'[^/?#@]+@' # r'(?P<user>[^/?#@]+)@' # discarded for now
query_re = r'\??[^#]*' # r'\??(?P<query>[^#]*)' # discarded for now
domain_re = r'(?P<domain>[^/?#]+)'
path_re = r'(?P<path>[^?#]*)'
http_regex = re.compile(f'{domain_re}{path_re}{query_re}.*')
ftp_regex = re.compile(f'(?:{user_re})?{domain_re}{path_re}.*')

class UrlError(ValueError):
    '''Raised for unparseable URLs or disallowed protocols.'''
    pass

class DeconstructedUrl:
    '''Represents a deconstructed URL or URL pattern'''
    def __init__(self, url):
        self.url = url

        match = proto_regex.match(url)
        if not match:
            raise UrlError(_('invalid_URL_{}').format(url))

        self.proto = match.group('proto')
        if self.proto not in ('http', 'https', 'ftp'):
            # Fix: 'proto' was an undefined name here, causing a NameError
            # instead of the intended UrlError; use the attribute.
            raise UrlError(_('disallowed_protocol_{}').format(self.proto))

        if self.proto == 'ftp':
            match = ftp_regex.match(match.group('rest'))
        elif self.proto in ('http', 'https'):
            match = http_regex.match(match.group('rest'))

        if not match:
            raise UrlError(_('invalid_URL_{}').format(url))

        # Domain segments are stored most-significant first, e.g.
        # 'www.example.com' -> ['com', 'example', 'www'].
        self.domain = match.group('domain').split('.')
        self.domain.reverse()
        # Path segments, with empty components (from '//' or a trailing '/')
        # filtered out.
        self.path = [*filter(None, match.group('path').split('/'))]
+
class MappingItem:
    '''
    A mapping, together with one of its patterns, as stored in Pattern Tree.
    '''
    def __init__(self, pattern, mapping):
        self.pattern = pattern
        self.mapping = mapping

    def register(self, patterns_by_proto):
        '''
        Make self queryable through the Pattern Tree that starts with the
        protocols dictionary passed in the argument.
        '''
        deconstructed = DeconstructedUrl(self.pattern)
        proto = deconstructed.proto

        # Lazily create the per-protocol domain tree.
        if patterns_by_proto.get(proto) is None:
            patterns_by_proto[proto] = PatternTreeNode()
        domain_tree = patterns_by_proto[proto]

        # Each matching path tree node keeps a plain list of MappingItems.
        for path_tree in domain_tree.add(deconstructed.domain,
                                         PatternTreeNode):
            for collected in path_tree.add(deconstructed.path, list):
                collected.append(self)
+
class Content:
    '''Stores serveable website content.'''
    def __init__(self):
        self.resources = {}
        self.mappings = {}
        self.licenses = {}
        self.indexes = {}
        # Dispatch table mapping a definition's 'type' field to its handler.
        self.definition_processors = {
            'resource': self.process_resource_or_mapping,
            'mapping': self.process_resource_or_mapping,
            'license': self.process_license
        }
        self.patterns_by_proto = {}

    @staticmethod
    def register_item(dict, item):
        '''
        Helper function used to add a versioned item definition to content
        data structures.
        '''
        identifier = item['identifier']
        versioned_item = dict.get(identifier)
        if versioned_item is None:
            versioned_item = VersionedContentItem()
            dict[identifier] = versioned_item

        versioned_item.register_item(item)

    @staticmethod
    def _process_copyright_and_license(definition):
        '''Helper function used by other process_*() methods.'''
        for field in ['copyright', 'licenses']:
            if definition[field] == 'auto':
                # Fix: the doubled braces in f'"{{field}}": "auto"' produced
                # the literal text '{field}'; interpolate the field name.
                raise MyNotImplError(f'"{field}": "auto"',
                                     definition['source_name'])

    def process_resource_or_mapping(self, definition, index):
        '''
        Sanitizes, autocompletes and registers serveable mapping/resource
        definition.
        '''
        definition['version'] = normalize_version(definition['version'])

        if definition['type'] == 'resource':
            self._process_copyright_and_license(definition)
            # Resources may omit 'dependencies'; default to none.
            definition['dependencies'] = definition.get('dependencies', [])
            self.register_item(self.resources, definition)
        else:
            self.register_item(self.mappings, definition)

    def process_license(self, license, index):
        '''Sanitizes and registers serveable license definition.'''
        identifier = license['identifier']
        if identifier in self.licenses:
            raise ValueError(_('license_clash_{}').format(identifier))

        self.licenses[identifier] = license

    def process_index(self, index, source_name):
        '''
        Sanitizes, autocompletes and registers data from a loaded index.json
        file.

        Raises ValueError on schema version mismatch or source name clash.
        '''
        schema_ver = normalize_version(index['schema_version'])
        index['schema_version'] = schema_ver
        if schema_ver != SCHEMA_VERSION:
            # Consistency fix: pass the message key through _() like every
            # other error message in this module.
            raise ValueError(_('index_json_schema_mismatch_{found}_{required}')
                             .format(found=version_string(schema_ver),
                                     required=version_string(SCHEMA_VERSION)))

        if source_name in self.indexes:
            raise ValueError(_('source_name_clash_{}').format(source_name))

        index['source_name'] = source_name

        self._process_copyright_and_license(index)

        self.indexes[source_name] = index

        for definition in index['definitions']:
            try:
                definition['source_name'] = source_name
                definition['source_copyright'] = index['copyright']
                definition['source_licenses'] = index['licenses']
                processor = self.definition_processors[definition['type']]
                processor(definition, index)
            except Exception as e:
                if current_app._pydrilla_werror:
                    raise e from None
                # Fix: 'subdir_path' is undefined in this scope; report the
                # source the broken definition came from instead.
                logging.error(_('couldnt_load_definition_from_%s'),
                              source_name, exc_info=True)

    @staticmethod
    def all_items(versioned_items_dict):
        '''Iterator over all registered versions of all items.'''
        for versioned_item in versioned_items_dict.values():
            for item in versioned_item.by_version.values():
                yield item

    def report_missing(self):
        '''
        Use logger to print information about items that are referenced but
        were not loaded.
        '''
        def report_missing_license(object, object_type, lic):
            # Log one unresolved license reference. %(name)s-style logging
            # placeholders take a single mapping argument (fix: keyword
            # arguments raised TypeError and 'ver'/'lic' were previously
            # used as undefined dict-key names).
            if object_type == 'index':
                logging.error(_('no_index_license_%(source)s_%(lic)s'),
                              {'source': object['source_name'], 'lic': lic})
                return

            ver_str = version_string(object['version'])
            log_args = {object_type: object['identifier'],
                        'ver': ver_str, 'lic': lic}
            if object_type == 'resource':
                fmt = _('no_resource_license_%(resource)s_%(ver)s_%(lic)s')
            else:
                fmt = _('no_mapping_license_%(mapping)s_%(ver)s_%(lic)s')

            logging.error(fmt, log_args)

        for object_type, iterable in [
                ('index', self.indexes.values()),
                ('resource', self.all_items(self.resources))
        ]:
            for object in iterable:
                # 'licenses' is a nested boolean expression such as
                # ['or', 'MIT', ['and', 'A', 'B']]; walk it and report every
                # license identifier that is not loaded.
                to_process = [object['licenses']]
                while to_process:
                    term = to_process.pop()

                    if type(term) is str:
                        if term not in ['or', 'and'] and \
                           term not in self.licenses:
                            # Fix: 'lic' was an undefined name here; the
                            # missing license is the current term.
                            report_missing_license(object, object_type, term)
                        continue

                    to_process.extend(term)

        def report_missing_dependency(resource, dep):
            # Log one unresolved resource dependency (single-dict argument
            # for the %(name)s placeholders — see fix note above).
            logging.error(_('no_dep_%(resource)s_%(ver)s_%(dep)s'),
                          {'dep': dep, 'resource': resource['identifier'],
                           'ver': version_string(resource['version'])})

        for resource in self.all_items(self.resources):
            for dep in resource['dependencies']:
                if dep not in self.resources:
                    report_missing_dependency(resource, dep)

        def report_missing_payload(mapping, payload):
            # Log one unresolved payload reference (single-dict argument).
            logging.error(_('no_payload_%(mapping)s_%(ver)s_%(payload)s'),
                          {'mapping': mapping['identifier'],
                           'payload': payload,
                           'ver': version_string(mapping['version'])})

        for mapping in self.all_items(self.mappings):
            for payload in mapping['payloads']:
                payload = payload['payload']
                if payload not in self.resources:
                    report_missing_payload(mapping, payload)

    def finalize(self):
        '''
        Initialize structures needed to serve queries. Called once after all
        data gets loaded.
        '''
        for versioned_dict in [self.resources, self.mappings]:
            for versioned_item in versioned_dict.values():
                versioned_item.known_versions.sort()

        for mapping in self.all_items(self.mappings):
            for payload in mapping['payloads']:
                # Fix: 'pattern' was an undefined name here. Each 'payloads'
                # entry pairs a payload with its pattern (cf. the 'payload'
                # key read in report_missing()). NOTE(review): confirm the
                # 'pattern' key name against the index.json schema.
                pattern = payload['pattern']
                try:
                    MappingItem(pattern, mapping)\
                        .register(self.patterns_by_proto)
                except Exception as e:
                    if current_app._pydrilla_werror:
                        raise e from None
                    logging.error(
                        _('couldnt_register_%(mapping)s_%(ver)s_%(pattern)s'),
                        {'mapping': mapping['identifier'],
                         'pattern': pattern,
                         'ver': version_string(mapping['version'])}
                    )

    def find_item(self, type, identifier, ver=None):
        '''
        Find and return definition of the newest version of resource/mapping
        named by identifier. If no such resource/mapping exists, return None.

        If ver is specified, instead find and return definition of that
        version of the item (or None if absent).
        '''
        items = self.resources if type == 'resource' else self.mappings
        versioned_item = items.get(identifier)
        if not versioned_item:
            return None

        ver = version_string(ver or versioned_item.known_versions[-1])

        return versioned_item.by_version.get(ver)

    def query(self, url, max=0):
        '''
        Yield registered patterns and mappings (available as MappingItems)
        that match url. The maximum number of items yielded may be limited by
        using the optional max argument. Its default value, 0, causes no
        limit to be imposed.

        If multiple versions of a mapping are applicable, only the most
        recent is included in the result.
        '''
        deco = DeconstructedUrl(url)

        domain_tree = self.patterns_by_proto.get(deco.proto) \
            or PatternTreeNode()
        for path_tree in domain_tree.search(deco.domain):
            for item in path_tree.search(deco.path):
                # A pattern ending in '/' only matches URLs ending in '/'.
                if url[-1] == '/' or item.pattern[-1] != '/':
                    yield item
                    max -= 1
                    if max == 0:
                        return
+
def load_content_from_subdir(subdir_path, source_name, content):
    '''Parse subdir_path/index.json and register its data with content.'''
    with open(subdir_path / 'index.json') as index_file:
        raw_text = index_file.read()

    index = json.loads(strip_json_comments(raw_text))
    content.process_index(index, source_name)
+
def load_content(path):
    '''
    Build a Content object from every subdirectory of the given content
    directory. Raises ValueError when path is not a directory. Failures in
    individual subdirectories are logged, unless the app was configured with
    werror, in which case they are re-raised.
    '''
    path = pathlib.Path(path)
    if not path.is_dir():
        raise ValueError(_('content_dir_path_not_dir'))

    content = Content()

    # Non-directory entries inside the content directory are ignored.
    subdirs = (entry for entry in path.iterdir() if entry.is_dir())

    for subdir_path in subdirs:
        try:
            load_content_from_subdir(subdir_path, subdir_path.name, content)
        except Exception as e:
            if current_app._pydrilla_werror:
                raise e from None
            logging.error(_('couldnt_load_content_from_%s'), subdir_path,
                          exc_info=True)

    content.report_missing()
    content.finalize()

    return content
+
def create_app(config_path=(here / 'config.json'), flask_config=None):
    '''
    Application factory: create and configure the pydrilla Flask app.

    config_path names the root JSON config file (str or pathlib.Path);
    flask_config is an optional dict merged into app.config (its 'lang' key
    also selects the gettext translation).

    Raises ValueError when a required config key is missing.
    '''
    app = Flask(__package__)
    # Fix: a mutable dict default argument would be shared between calls.
    flask_config = {} if flask_config is None else flask_config
    app.config.update(flask_config)

    language = flask_config.get('lang', 'en')
    translation = gettext.translation('pydrilla', localedir=(here / 'locales'),
                                      languages=[language])

    app._pydrilla_gettext = translation.gettext

    # https://stackoverflow.com/questions/9449101/how-to-stop-flask-from-initialising-twice-in-debug-mode
    if app.debug and os.environ.get('WERKZEUG_RUN_MAIN') != 'true':
        return app

    config = load_config(config_path)
    for key in ['static_resource_uri', 'content_dir']:
        if key not in config:
            # Fix: module-level _() requires an active application context,
            # which does not exist here; use the translation directly so the
            # intended ValueError (not a RuntimeError) is raised.
            raise ValueError(translation.gettext('config_key_absent_{}')
                             .format(key))

    app._pydrilla_static_resource_uri = config['static_resource_uri']
    app._pydrilla_werror = config.get('werror', False)
    if 'hydrilla_parent' in config:
        # Wrap in Path so a str config_path argument also works with .name.
        # NOTE(review): MyNotImplError itself calls _() and hence also needs
        # an app context — confirm/fix alongside this call site.
        raise MyNotImplError('hydrilla_parent',
                             pathlib.Path(config_path).name)
    with app.app_context():
        app._pydrilla_content = load_content(config['content_dir'])

    app.register_blueprint(bp)

    return app
+
def _(text_key):
    '''Translate text_key using the current app's gettext translation.'''
    # Requires an active Flask application (or request) context.
    return current_app._pydrilla_gettext(text_key)
+
def escaping_gettext(text_key):
    '''Like _() but with the translated text HTML-escaped (for templates).'''
    from markupsafe import escape

    return str(escape(_(text_key)))
+
class MyEnvironment(Environment):
    '''
    A wrapper class around jinja2.Environment that causes GNU gettext function
    (as '_' and '__') and url_for function to be passed to every call of each
    template's render() method.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_template(self, *args, **kwargs):
        '''Return the template with its render() wrapped to inject helpers.'''
        template = super().get_template(*args, **kwargs)
        original_render = template.render

        def render_with_helpers(*render_args, **render_kwargs):
            # Explicit keyword arguments win over the injected defaults.
            defaults = {
                '_': escaping_gettext,
                '__': escaping_gettext,
                'url_for': url_for
            }
            return original_render(*render_args,
                                   **{**defaults, **render_kwargs})

        template.render = render_with_helpers

        return template
+
j2env = MyEnvironment(loader=PackageLoader(__package__), autoescape=False)

indexpage = j2env.get_template('index.html')

@bp.route('/')
def index():
    '''Serve the repository's HTML landing page.'''
    # Fix: create_app() stores the loaded content as '_pydrilla_content';
    # '_pydrilla_resources_map' is never set anywhere, so rendering would
    # raise AttributeError.
    return indexpage.render(content=current_app._pydrilla_content)
+
for item_type in ['resource', 'mapping']:
    # Serve a single resource/mapping definition as JSON; an optional 'ver'
    # query parameter selects a specific version (default: newest).
    #
    # Fix: bind item_type as a default argument. Closures capture variables
    # late, so without the binding both registered views would see the final
    # loop value ('mapping') at request time.
    def item(identifier, item_type=item_type):
        ver = request.args.get('ver')
        if ver is not None:
            try:
                ver = normalize_version(parse_version(ver))
            except ValueError:
                # Fix: catch only the parse failure instead of a bare
                # except; a malformed version string is a client error.
                abort(400)

        item = current_app._pydrilla_content\
               .find_item(item_type, identifier, ver)
        if item is None:
            abort(404)

        return json.dumps(item)

    # Give each view a distinct name so Flask endpoints don't collide.
    item.__name__ = item_type + 's'
    bp.route(f'/{item_type}s/<string:identifier>')(item)