aboutsummaryrefslogtreecommitdiff
path: root/src/pydrilla/pydrilla.py
diff options
context:
space:
mode:
authorWojtek Kosior <koszko@koszko.org>2022-02-08 15:29:49 +0100
committerWojtek Kosior <koszko@koszko.org>2022-02-09 15:34:46 +0100
commit6676b4ed90e19e2fd6ee5f4242cf85f64db145d8 (patch)
tree42b45c6ed731abeab85e160b020bc57cab638fff /src/pydrilla/pydrilla.py
parent67631e6c5db6739f7a57958d222e5af7ebc364b0 (diff)
downloadhaketilo-hydrilla-6676b4ed90e19e2fd6ee5f4242cf85f64db145d8.tar.gz
haketilo-hydrilla-6676b4ed90e19e2fd6ee5f4242cf85f64db145d8.zip
rework Hydrilla to use a separate tool for building its source packages
* Hydrilla now depends on "Hydrilla builder" developed at: https://git.koszko.org/hydrilla-builder/ * Hydrilla repository is now REUSE-compliant * The debian packaging is temporarily not tested and likely to be broken * JSON schemas are now in use (through 'jsonschema' Python library) * This is not yet a release and some minor changes to the API on-disk format are going to occur before that
Diffstat (limited to 'src/pydrilla/pydrilla.py')
-rw-r--r--src/pydrilla/pydrilla.py755
1 files changed, 0 insertions, 755 deletions
diff --git a/src/pydrilla/pydrilla.py b/src/pydrilla/pydrilla.py
deleted file mode 100644
index d7aef76..0000000
--- a/src/pydrilla/pydrilla.py
+++ /dev/null
@@ -1,755 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-or-later
-
-# Main repository logic.
-#
-# This file is part of Hydrilla
-#
-# Copyright (C) 2021 Wojtek Kosior
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
-#
-#
-# I, Wojtek Kosior, thereby promise not to sue for violation of this
-# file's license. Although I request that you do not make use this code
-# in a proprietary program, I am not going to enforce this in court.
-
-from flask import Flask, Blueprint, current_app, url_for, abort, request, \
- redirect
-from jinja2 import Environment, PackageLoader
-import re
-from hashlib import sha256
-import os
-import pathlib
-import json
-import gettext
-import logging
-
# Version of the index.json schema this module accepts (compared against the
# normalized "schema_version" of every loaded index).
SCHEMA_VERSION = [0, 2]

# Matches one line of JSON-with-comments. Group 1 is the part of the line that
# precedes a comment; group 2 is what terminated it: '//' (a comment), '/' (a
# malformed comment) or '' (end of line).
strip_comment_re = re.compile(r'''
^ # match from the beginning of each line
( # catch the part before '//' comment
  (?: # this group matches either a string or a single out-of-string character
    [^"/] |
    "
    (?: # this group matches any in-a-string character
      [^"\\] | # match any normal character
      \\[^u] | # match any escaped character like '\f' or '\n'
      \\u[a-fA-F0-9]{4} # match an escape
    )*
    "
  )*
)
# expect either end-of-line or a comment:
# * unterminated strings will cause matching to fail
# * bad comment (with '/' instead of '//') will be indicated by second group
# having length 1 instead of 2 or 0
(//?|$)
''', re.VERBOSE)

def strip_json_comments(text):
    '''
    Return text with every '//' line comment removed.

    Lines are processed independently. A line containing an unterminated
    string is passed through untouched so that the json module reports the
    problem itself. A lone '/' outside of a string raises
    json.JSONDecodeError pointing at the offending position in text.
    '''
    offset = 0  # index in text at which the current line starts
    kept_lines = []

    for line in text.split('\n'):
        found = strip_comment_re.match(line)

        if found is None:
            # unterminated string; leave the line for json to complain about
            kept = line
        else:
            json_part, terminator = found.group(1), found.group(2)
            if len(terminator) == 1:
                raise json.JSONDecodeError('bad comment', text,
                                           offset + len(json_part))
            kept = json_part

        kept_lines.append(kept)
        offset += len(line) + 1  # +1 accounts for the removed '\n'

    return '\n'.join(kept_lines)
-
# Directory that holds this module; the default config.json and the
# 'locales' directory are looked up relative to it.
here = pathlib.Path(__file__).resolve().parents[0]

# Blueprint on which all HTTP routes of this module get registered.
bp = Blueprint('bp', __package__)
-
def load_config(config_path):
    '''
    Load the JSON (with '//' comments) config file at config_path together
    with every config file it references, and return the merged dict.

    A config may list further files under "try_configs" (files that fail to
    load are skipped) and "use_configs" (failure to load is an error).
    Settings from a file loaded later override those loaded earlier. The
    "try_configs" and "use_configs" keys are removed from the result.

    Any exception raised while reading or parsing a mandatory file is
    propagated to the caller.
    '''
    config = {}
    # Stack of (path, can_fail) pairs still waiting to be loaded.
    pending = [(config_path, False)]

    while pending:
        path, can_fail = pending.pop()

        try:
            # Open the file popped from the stack, not the top-level config.
            with open(path) as config_file:
                new_config = json.loads(strip_json_comments(config_file.read()))
        except Exception as e:
            if can_fail:
                continue
            raise e from None

        config.update(new_config)

        for key, failure_ok in (('try_configs', True), ('use_configs', False)):
            # Push in reverse so that listed files get popped in listed order.
            pending.extend((referenced, failure_ok)
                           for referenced in reversed(new_config.get(key, [])))

    for key in ('try_configs', 'use_configs'):
        config.pop(key, None)

    return config
-
def get_content_file_path(path):
    '''
    Convert a '/'-separated relative path string into a pathlib.Path that
    uses the platform's separator.

    Raises ValueError if the resulting path is absolute.
    '''
    if os.path.sep != '/':
        # str.replace() returns a new string; the result has to be kept.
        path = path.replace('/', os.path.sep)

    path = pathlib.Path(path)
    if path.is_absolute():
        raise ValueError(_('path_is_absolute_{}').format(path))

    return path
-
class MyNotImplError(NotImplementedError):
    '''Raised when a planned but not-yet-completed feature is used.'''
    def __init__(self, what, where):
        # what - the unsupported feature; where - the place it was requested
        # in; both get substituted into the translated message
        message_template = _('not_implemented_{what}_{where}')
        super().__init__(message_template.format(what=what, where=where))
-
def normalize_version(ver):
    '''
    Return a copy of ver (a sequence of integers) with all trailing zeroes
    removed. The argument itself is left unmodified.
    '''
    end = len(ver)
    # walk back from the right for as long as we keep seeing zeroes
    while end > 0 and ver[end - 1] == 0:
        end -= 1

    return ver[:end]
-
def parse_version(ver_str):
    '''
    Split a dot-separated version string into a list of integers, e.g.
    "4.6.13.0" becomes [4, 6, 13, 0]. int() raises ValueError for any
    component that is not an integer literal.
    '''
    return list(map(int, ver_str.split('.')))
-
def version_string(ver, rev=None):
    '''
    Format ver (a sequence of integers) as a dotted string. When rev (an
    integer) is given, it is appended after a dash, like:
        1.2.3-5
    The version is used as-is; no normalization takes place.
    '''
    dotted = '.'.join(map(str, ver))
    if rev is None:
        return dotted

    return f'{dotted}-{rev}'
-
class VersionedContentItem:
    '''Stores definitions of multiple versions of website content item.'''
    def __init__(self):
        self.uuid = None
        self.identifier = None
        # version string (see version_string()) -> item definition
        self.by_version = {}
        # versions (lists of integers) in registration order
        self.known_versions = []

    def register_item(self, item):
        '''
        Make item queryable by version. Perform sanity checks for uuid.

        Raises ValueError when the item's uuid differs from that of the items
        registered before, or when the same version is registered twice.
        '''
        if self.identifier is None:
            self.identifier = item['identifier']
            self.uuid = item['uuid']
        elif self.uuid != item['uuid']:
            raise ValueError(_('uuid_mismatch_{identifier}')
                             .format(identifier=self.identifier))

        ver = item['version']
        ver_str = version_string(ver)

        if ver_str in self.by_version:
            raise ValueError(_('version_clash_{identifier}_{version}')
                             .format(identifier=self.identifier,
                                     version=ver_str))

        self.by_version[ver_str] = item
        self.known_versions.append(ver)

    def get_by_ver(self, ver=None):
        '''
        Return the definition stored for the last entry of known_versions.

        If ver is specified, instead find and return definition of that
        version of the item (or None if absent). None is also returned when
        no version has been registered at all.
        '''
        # Compare against None explicitly: a normalized version may be the
        # empty list (e.g. "0"), which is falsy but still a real version.
        if ver is None:
            if not self.known_versions:
                return None
            ver = self.known_versions[-1]

        return self.by_version.get(version_string(ver))

    def get_all(self):
        '''
        Return a list of all definitions of item, in the order of
        known_versions.
        '''
        return [self.by_version[version_string(ver)]
                for ver in self.known_versions]
-
class PatternTreeNode:
    '''
    "Pattern Tree" is how we refer to the data structure used for querying
    Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
    is to make it possible for given URL to quickly retrieve all known patterns
    that match it.
    '''
    def __init__(self):
        # items registered under a trailing '*', '**' and '***' segment that
        # directly follows this node (indexed by wildcard length minus one)
        self.wildcard_matches = [None, None, None]
        # item registered for the path that ends exactly at this node
        self.literal_match = None
        # segment -> child PatternTreeNode
        self.children = {}

    def search(self, segments):
        '''
        Generate every item stored in the tree rooted at this node that
        matches the given sequence of segments. The deepest reachable node is
        examined first, so more specific patterns come out before less
        specific ones.
        '''
        # Descend as far as the segments can be followed literally.
        trail = [self]
        for segment in segments:
            child = trail[-1].children.get(segment)
            if child is None:
                break

            trail.append(child)

        total = len(segments)

        # Unwind from the deepest node back to the root.
        while trail:
            node = trail.pop()
            # len(trail) is now the number of segments consumed to reach node
            left = total - len(trail)

            literal = node.literal_match
            one, two_or_more, any_number = node.wildcard_matches

            if literal is not None and left == 0:
                yield literal
            if one is not None and left == 1 and segments[-1] != '*':
                yield one
            if two_or_more is not None and left > 1:
                yield two_or_more
            if any_number is not None and \
               (left != 1 or segments[-1] != '***'):
                yield any_number

    def add(self, segments, item_instantiator):
        '''
        Register the path given by segments in (this branch of) the Pattern
        Tree, creating intermediate nodes as needed. Items that do not exist
        yet are created by calling item_instantiator(). Returns a list with
        the literal item of the final node and, if the last segment is a
        wildcard ('*', '**' or '***'), also the matching wildcard item of the
        node preceding it.
        '''
        parent = None
        last_segment = None
        node = self

        for last_segment in segments:
            parent = node

            node = parent.children.get(last_segment)
            if not node:
                node = PatternTreeNode()
            parent.children[last_segment] = node

        if node.literal_match is None:
            node.literal_match = item_instantiator()

        if last_segment not in ('*', '**', '***'):
            return [node.literal_match]

        # wildcard items live on the node that precedes the wildcard segment
        slot = len(last_segment) - 1
        if parent.wildcard_matches[slot] is None:
            parent.wildcard_matches[slot] = item_instantiator()

        return [node.literal_match, parent.wildcard_matches[slot]]
-
# Splits 'proto://rest'.
proto_regex = re.compile(r'^(?P<proto>\w+)://(?P<rest>.*)$')
user_re = r'[^/?#@]+@' # r'(?P<user>[^/?#@]+)@' # discarded for now
query_re = r'\??[^#]*' # r'\??(?P<query>[^#]*)' # discarded for now
domain_re = r'(?P<domain>[^/?#]+)'
path_re = r'(?P<path>[^?#]*)'
# Applied to the part of the URL that follows '://'.
http_regex = re.compile(f'{domain_re}{path_re}{query_re}.*')
ftp_regex = re.compile(f'(?:{user_re})?{domain_re}{path_re}.*')

class UrlError(ValueError):
    '''Raised for URLs (or URL patterns) that cannot be deconstructed.'''
    pass

class DeconstructedUrl:
    '''
    Represents a deconstructed URL or URL pattern.

    Attributes set by the constructor:
      url    - the original string
      proto  - 'http', 'https' or 'ftp'
      domain - list of domain labels in reversed order, e.g.
               ['com', 'example', 'www'] for 'www.example.com'
      path   - list of non-empty path segments

    The constructor raises UrlError when url does not have the
    'proto://domain...' shape or uses a protocol other than the three above.
    '''
    def __init__(self, url):
        self.url = url

        match = proto_regex.match(url)
        if not match:
            raise UrlError(_('invalid_URL_{}').format(url))

        self.proto = match.group('proto')
        if self.proto not in ('http', 'https', 'ftp'):
            # report the offending protocol (stored on self; there is no
            # local variable named 'proto')
            raise UrlError(_('disallowed_protocol_{}').format(self.proto))

        # only 'http' and 'https' remain once 'ftp' is ruled out
        rest_regex = ftp_regex if self.proto == 'ftp' else http_regex
        match = rest_regex.match(match.group('rest'))

        if not match:
            raise UrlError(_('invalid_URL_{}').format(url))

        self.domain = match.group('domain').split('.')
        self.domain.reverse()
        # filter(None, ...) drops empty segments produced by leading,
        # trailing or doubled slashes
        self.path = [*filter(None, match.group('path').split('/'))]
-
class MappingItem:
    '''
    Pairs a mapping definition with one of its URL patterns; instances are
    what gets stored in the Pattern Tree.
    '''
    def __init__(self, pattern, mapping):
        self.pattern = pattern
        self.mapping = mapping

    def register(self, patterns_by_proto):
        '''
        Insert self into the Pattern Tree whose top level is the
        protocol -> domain tree dictionary passed as patterns_by_proto.
        '''
        deconstructed = DeconstructedUrl(self.pattern)
        proto = deconstructed.proto

        domain_tree = patterns_by_proto.get(proto)
        if not domain_tree:
            domain_tree = PatternTreeNode()
        patterns_by_proto[proto] = domain_tree

        # Domain tree leaves are path trees; path tree leaves are lists of
        # MappingItem objects.
        path_trees = domain_tree.add(deconstructed.domain, PatternTreeNode)
        for path_tree in path_trees:
            for match_list in path_tree.add(deconstructed.path, list):
                match_list.append(self)
-
class Content:
    '''Stores serveable website content.'''
    def __init__(self, content_dir_path):
        '''
        When an instance of Content is constructed, it searches
        content_dir_path for custom serveable site content and loads it.

        Every subdirectory of content_dir_path is expected to hold an
        index.json file. A subdirectory that fails to load is logged and
        skipped, unless the application's 'werror' setting is on, in which
        case the exception propagates. Has to be called within a Flask
        application context (current_app is consulted).

        Raises ValueError if content_dir_path is not a directory.
        '''
        # identifier -> VersionedContentItem
        self.resources = {}
        self.mappings = {}
        # identifier -> license definition
        self.licenses = {}
        # source name (subdirectory name) -> loaded index
        self.indexes = {}
        # definition "type" -> method that processes such definition
        self.definition_processors = {
            'resource': self._process_resource_or_mapping,
            'mapping': self._process_resource_or_mapping,
            'license': self._process_license
        }
        # protocol -> root PatternTreeNode of the domain tree
        self.patterns_by_proto = {}
        # resolved file path -> hex sha256 digest (cache)
        self.file_sha256sums = {}

        self.content_dir_path = pathlib.Path(content_dir_path).resolve()

        if not self.content_dir_path.is_dir():
            raise ValueError(_('content_dir_path_not_dir'))

        for subdir_path in self.content_dir_path.iterdir():
            if not subdir_path.is_dir():
                continue
            try:
                self._load_content_from_subdir(subdir_path, subdir_path.name)
            except Exception as e:
                if current_app._pydrilla_werror:
                    raise e from None
                logging.error(_('couldnt_load_content_from_%s'), subdir_path,
                              exc_info=True)

        self._report_missing()
        self._finalize()

    def _load_content_from_subdir(self, subdir_path, source_name):
        '''
        Helper function used to load definitions from index.json of a
        subdirectory of the content directory.
        '''
        index_path = subdir_path / 'index.json'
        with open(index_path) as index_file:
            index = json.loads(strip_json_comments(index_file.read()))

        self._process_index(index, source_name)

    @staticmethod
    def register_item(dict, item):
        '''
        Helper function used to add a versioned item definition to content
        data structures.

        dict maps identifiers to VersionedContentItem objects (the parameter
        name shadows the builtin; it is kept for interface compatibility).
        '''
        identifier = item['identifier']
        versioned_item = dict.get(identifier)
        if versioned_item is None:
            versioned_item = VersionedContentItem()
            dict[identifier] = versioned_item

        versioned_item.register_item(item)

    @staticmethod
    def _process_copyright_and_license(definition):
        '''
        Helper function used by other _process_*() methods. Raises
        MyNotImplError if "copyright" or "licenses" of the definition is
        set to "auto".
        '''
        for field in ['copyright', 'licenses']:
            if definition[field] == 'auto':
                # single braces, so that the field name gets interpolated
                raise MyNotImplError(f'"{field}": "auto"',
                                     definition['source_name'])

    def _get_file_sha256sum(self, path):
        '''
        Compute sha256 of the file at path and return it as a hex string.
        Cache results on this Content object, keyed by resolved path.
        '''
        path = path.resolve()
        sha256sum = self.file_sha256sums.get(path)

        if sha256sum is None:
            with open(path, mode='rb') as hashed_file:
                sha256sum = sha256(hashed_file.read()).digest().hex()
            self.file_sha256sums[path] = sha256sum

        return sha256sum

    def _add_file_sha256sum(self, source_name, file_object):
        '''
        Expect file_object to be a dict with field "file" holding a file path
        relative to content directory's subdirectory source_name. Compute or
        fetch from cache the sha256 sum of that file and put it in
        file_object's "sha256" field.
        '''
        file_path = self.content_dir_path / source_name / file_object['file']
        file_object['sha256'] = self._get_file_sha256sum(file_path)

    def _process_resource_or_mapping(self, definition, index):
        '''
        Sanitizes, autocompletes and registers serveable mapping/resource
        definition.
        '''
        definition['version'] = normalize_version(definition['version'])

        if definition['type'] == 'resource':
            self._process_copyright_and_license(definition)
            definition['dependencies'] = definition.get('dependencies', [])
            self.register_item(self.resources, definition)
            source_name = definition['source_name']
            for script in definition['scripts']:
                self._add_file_sha256sum(source_name, script)
        else:
            self.register_item(self.mappings, definition)

    def _process_license(self, license, index):
        '''
        Sanitizes and registers serveable license definition. Raises
        ValueError if a license with the same identifier is already present.
        '''
        identifier = license['identifier']
        if identifier in self.licenses:
            raise ValueError(_('license_clash_{}').format(identifier))

        self.licenses[identifier] = license

        source_name = license['source_name']
        for legal_text in license['legal_text']:
            self._add_file_sha256sum(source_name, legal_text)

        notice = license.get('notice')
        if notice is not None:
            self._add_file_sha256sum(source_name, notice)

    def _process_index(self, index, source_name):
        '''
        Sanitizes, autocompletes and registers data from a loaded index.json
        file.

        Raises ValueError on schema version mismatch or when an index was
        already loaded under source_name. A definition that fails to process
        is logged and skipped, unless the application's 'werror' setting is
        on, in which case the exception propagates.
        '''
        schema_ver = normalize_version(index['schema_version'])
        index['schema_version'] = schema_ver
        if schema_ver != SCHEMA_VERSION:
            # translate this message like all the other ones
            raise ValueError(_('index_json_schema_mismatch_{found}_{required}')
                             .format(found=version_string(schema_ver),
                                     required=version_string(SCHEMA_VERSION)))

        if source_name in self.indexes:
            raise ValueError(_('source_name_clash_{}').format(source_name))

        index['source_name'] = source_name

        self._process_copyright_and_license(index)

        self.indexes[source_name] = index

        for definition in index['definitions']:
            try:
                definition['source_name'] = source_name
                definition['source_copyright'] = index['copyright']
                definition['source_licenses'] = index['licenses']
                processor = self.definition_processors[definition['type']]
                processor(definition, index)
            except Exception as e:
                if current_app._pydrilla_werror:
                    raise e from None
                # only the source name is known here (not the directory path)
                logging.error(_('couldnt_load_definition_from_%s'),
                              source_name, exc_info=True)

    @staticmethod
    def all_items(versioned_items_dict):
        '''Iterator over all registered versions of all items.'''
        for versioned_item in versioned_items_dict.values():
            for item in versioned_item.by_version.values():
                yield item

    def _report_missing(self):
        '''
        Use logger to print information about items that are referenced but
        were not loaded: licenses named by indexes and resources,
        dependencies of resources and payloads of mappings.
        '''
        # NOTE: logging functions do not accept arbitrary keyword arguments;
        # values for '%(name)s' placeholders are passed as a single dict.
        def report_missing_license(item, item_type, lic):
            if item_type == 'index':
                logging.error(_('no_index_license_%(source)s_%(lic)s'),
                              {'source': item['source_name'], 'lic': lic})
                return

            fmt_args = {
                item_type: item['identifier'],
                'ver': version_string(item['version']),
                'lic': lic
            }
            if item_type == 'resource':
                fmt = _('no_resource_license_%(resource)s_%(ver)s_%(lic)s')
            else:
                fmt = _('no_mapping_license_%(mapping)s_%(ver)s_%(lic)s')

            logging.error(fmt, fmt_args)

        for item_type, iterable in [
                ('index', self.indexes.values()),
                ('resource', self.all_items(self.resources))
        ]:
            for item in iterable:
                # "licenses" is either a string or a (possibly nested) list
                # of strings; walk it with an explicit stack.
                to_process = [item['licenses']]
                while to_process:
                    term = to_process.pop()

                    if isinstance(term, str):
                        if term not in ['or', 'and'] and \
                           term not in self.licenses:
                            report_missing_license(item, item_type, term)
                        continue

                    to_process.extend(term)

        def report_missing_dependency(resource, dep):
            logging.error(_('no_dep_%(resource)s_%(ver)s_%(dep)s'),
                          {'dep': dep, 'resource': resource['identifier'],
                           'ver': version_string(resource['version'])})

        for resource in self.all_items(self.resources):
            for dep in resource['dependencies']:
                if dep not in self.resources:
                    report_missing_dependency(resource, dep)

        def report_missing_payload(mapping, payload):
            logging.error(_('no_payload_%(mapping)s_%(ver)s_%(payload)s'),
                          {'mapping': mapping['identifier'],
                           'payload': payload,
                           'ver': version_string(mapping['version'])})

        for mapping in self.all_items(self.mappings):
            for payload in mapping['payloads']:
                payload = payload['payload']
                if payload not in self.resources:
                    report_missing_payload(mapping, payload)

    def _finalize(self):
        '''
        Initialize structures needed to serve queries. Called once after all
        data gets loaded.

        Sorts the known versions of every item and registers the patterns of
        all mappings in the Pattern Tree. A pattern that fails to register is
        logged and skipped, unless the application's 'werror' setting is on.
        '''
        for versioned_items in [self.resources, self.mappings]:
            for versioned_item in versioned_items.values():
                versioned_item.known_versions.sort()

        for mapping in self.all_items(self.mappings):
            for payload in mapping['payloads']:
                pattern = payload['pattern']
                try:
                    MappingItem(pattern, mapping)\
                        .register(self.patterns_by_proto)
                except Exception as e:
                    if current_app._pydrilla_werror:
                        raise e from None
                    # placeholder values go in a dict; exc_info matches the
                    # other error reports of this class
                    logging.error(
                        _('couldnt_register_%(mapping)s_%(ver)s_%(pattern)s'),
                        {'mapping': mapping['identifier'],
                         'pattern': pattern,
                         'ver': version_string(mapping['version'])},
                        exc_info=True
                    )

    def query(self, url):
        '''
        Return a list of registered mappings that match url.

        If multiple versions of a mapping are applicable, only the most recent
        is included in the result. DeconstructedUrl raises UrlError for a url
        it cannot process.
        '''
        deco = DeconstructedUrl(url)

        mappings = {}

        domain_tree = self.patterns_by_proto.get(deco.proto) \
            or PatternTreeNode()

        def process_item(item):
            # a pattern with a trailing slash does not apply to a URL that
            # lacks one
            if url[-1] != '/' and item.pattern[-1] == '/':
                return

            identifier = item.mapping['identifier']

            if identifier not in mappings or \
               item.mapping['version'] > mappings[identifier]['version']:
                mappings[identifier] = item.mapping

        for path_tree in domain_tree.search(deco.domain):
            for item_list in path_tree.search(deco.path):
                for item in item_list:
                    process_item(item)

        return list(mappings.values())
-
def create_app(config_path=(here / 'config.json'), flask_config=None):
    '''
    Create and return the Flask application.

    config_path  - path (str or pathlib.Path) of the main config file; a
                   relative "content_dir" setting is resolved against its
                   parent directory
    flask_config - optional dict used to update app.config; its 'lang' key
                   (default 'en') selects the gettext translation

    When the app is in debug mode and WERKZEUG_RUN_MAIN is not 'true', the
    app is returned before config and content get loaded (see the link
    below).

    Raises ValueError if a required config key is absent and MyNotImplError
    if the config contains 'hydrilla_parent'.
    '''
    if flask_config is None:
        flask_config = {}
    # accept plain strings as well; .name and .parent are used below
    config_path = pathlib.Path(config_path)

    app = Flask(__package__)
    app.config.update(flask_config)

    language = flask_config.get('lang', 'en')
    translation = gettext.translation('pydrilla', localedir=(here / 'locales'),
                                      languages=[language])

    app._pydrilla_gettext = translation.gettext

    # https://stackoverflow.com/questions/9449101/how-to-stop-flask-from-initialising-twice-in-debug-mode
    if app.debug and os.environ.get('WERKZEUG_RUN_MAIN') != 'true':
        return app

    config = load_config(config_path)

    # _() and MyNotImplError look the translation up through current_app, so
    # everything that may produce a message runs in an application context.
    with app.app_context():
        for key in ['static_resource_uri', 'content_dir',
                    'hydrilla_sources_uri']:
            if key not in config:
                raise ValueError(_('config_key_absent_{}').format(key))

        app._pydrilla_static_resource_uri = config['static_resource_uri']
        # endswith() also copes with an empty string
        if not app._pydrilla_static_resource_uri.endswith('/'):
            app._pydrilla_static_resource_uri += '/'
        app._pydrilla_hydrilla_sources_uri = config['hydrilla_sources_uri']
        app._pydrilla_werror = config.get('werror', False)
        if 'hydrilla_parent' in config:
            raise MyNotImplError('hydrilla_parent', config_path.name)

        content_dir = pathlib.Path(config['content_dir'])
        if not content_dir.is_absolute():
            content_dir = config_path.parent / content_dir

        app._pydrilla_content = Content(content_dir.resolve())

    app.register_blueprint(bp)

    return app
-
def _(text_key):
    '''Translate text_key using the gettext function stored on current_app.'''
    translate = current_app._pydrilla_gettext
    return translate(text_key)
-
def content():
    '''Return the Content object stored on current_app.'''
    app = current_app
    return app._pydrilla_content
-
class MyEnvironment(Environment):
    '''
    jinja2.Environment subclass whose templates always get rendered with a
    few extra variables: HTML-escaping gettext (as '_' and '__'), Flask's
    url_for and the 'hydrilla_sources_uri' config option. Variables passed
    explicitly to render() take precedence over these.
    '''

    def get_template(self, *args, **kwargs):
        '''
        Fetch a template as jinja2.Environment.get_template() does and
        replace its render() with a wrapper that supplies the extra
        variables.
        '''
        template = super().get_template(*args, **kwargs)
        plain_render = template.render

        def render_with_defaults(*render_args, **render_kwargs):
            # looked up at render time, so current_app has to be available
            translate = current_app._pydrilla_gettext
            sources_uri = current_app._pydrilla_hydrilla_sources_uri

            def escaping_gettext(text_key):
                from markupsafe import escape

                translated = translate(text_key)
                return str(escape(translated))

            context = {
                '_': escaping_gettext,
                '__': escaping_gettext,
                'url_for': url_for,
                'hydrilla_sources_uri' : sources_uri,
                **render_kwargs
            }

            return plain_render(*render_args, **context)

        template.render = render_with_defaults

        return template
-
# Autoescaping is off; translated strings are escaped by the gettext wrapper
# that MyEnvironment passes to templates.
j2env = MyEnvironment(autoescape=False, loader=PackageLoader(__package__))

# Loaded once, at import time.
indexpage = j2env.get_template('index.html')

@bp.route('/')
def index():
    '''Serve the index page.'''
    page = indexpage.render()
    return page
-
def get_resource_or_mapping(identifier, get_dict):
    '''
    Return the JSON-serialized definition of the resource or mapping called
    identifier. get_dict is a function returning the dict of
    VersionedContentItem objects to look the identifier up in.

    The 'ver' query parameter selects what gets returned:
      * absent          - VersionedContentItem.get_by_ver() is called without
                          a version
      * 'all'           - a list of all versions (empty if the item is
                          unknown)
      * dotted integers - that specific version

    Aborts with 400 if 'ver' cannot be parsed and with 404 if there is no
    matching definition.
    '''
    ver = request.args.get('ver')
    versioned_item = get_dict().get(identifier)

    if ver == 'all':
        definition = versioned_item.get_all() if versioned_item else []
    else:
        if ver is not None:
            try:
                ver = normalize_version(parse_version(ver))
            except ValueError:
                # int() rejected one of the version components
                abort(400)

        definition = versioned_item and versioned_item.get_by_ver(ver)
        if definition is None:
            abort(404)

    return json.dumps(definition)
-
def get_license_or_source(identifier, get_dict):
    '''
    Return the JSON-serialized definition stored under identifier in the dict
    returned by get_dict(). Aborts with 404 if there is none.
    '''
    found = get_dict().get(identifier)
    if found is not None:
        return json.dumps(found)

    abort(404)
-
# Register '/resources/<id>', '/mappings/<id>', '/licenses/<id>' and
# '/sources/<id>'. Each row holds: the item type (also used as the endpoint
# name), a function returning the dict to search and the handler doing the
# lookup.
for item_type, get_dict, get_item in (
        ('resource', lambda: content().resources, get_resource_or_mapping),
        ('mapping', lambda: content().mappings, get_resource_or_mapping),
        ('license', lambda: content().licenses, get_license_or_source),
        ('source', lambda: content().indexes, get_license_or_source),
):
    # Default arguments bind the current row's functions to this view; a
    # plain closure would see the values of the last iteration only.
    def _get_item(identifier, get_dict=get_dict, get_item=get_item):
        return get_item(identifier, get_dict)

    rule = f'/{item_type}s/<string:identifier>'
    bp.add_url_rule(rule, item_type, _get_item)
-
@bp.route('/query')
def query():
    '''
    Return the JSON-serialized list of mappings that match the URL given in
    the 'url' query parameter. Aborts with 400 if that URL is rejected by
    Content.query() with UrlError.
    '''
    url = request.args['url']

    try:
        matching = content().query(url)
    except UrlError:
        # a malformed URL is the client's error, not a server failure
        abort(400)

    return json.dumps(matching)
-
@bp.route('/sources/<string:identifier>/<path:path>')
def get_file(identifier, path):
    '''
    Redirect (301) a request for a file of the source called identifier to
    its location under the configured static resource URI. Aborts with 404
    if no such source is loaded.
    '''
    if identifier in content().indexes:
        base_uri = current_app._pydrilla_static_resource_uri
        return redirect(f'{base_uri}{identifier}/{path}', code=301)

    abort(404)