From f4edcbe7f4739d6f82a2e1bb180960b003b30862 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 19 Nov 2021 10:32:31 +0100 Subject: fill served definitions with computed sha256 sums of files --- src/pydrilla/pydrilla.py | 124 ++++++++++++++++++++---------- src/test/example_content/hello/index.json | 21 ++--- src/test/test_pydrilla.py | 17 ++++ 3 files changed, 112 insertions(+), 50 deletions(-) diff --git a/src/pydrilla/pydrilla.py b/src/pydrilla/pydrilla.py index b0a5974..9e697ba 100644 --- a/src/pydrilla/pydrilla.py +++ b/src/pydrilla/pydrilla.py @@ -28,7 +28,7 @@ from flask import Flask, Blueprint, current_app, url_for, abort, request, \ redirect from jinja2 import Environment, PackageLoader import re -#from hashlib import sha256 +from hashlib import sha256 import os import pathlib import json @@ -333,17 +333,52 @@ class MappingItem: class Content: '''Stores serveable website content.''' - def __init__(self): + def __init__(self, content_dir_path): + ''' + When an instance of Content is constructed, it searches + content_dir_path for custom serveable site content and loads it. 
+ ''' self.resources = {} self.mappings = {} self.licenses = {} self.indexes = {} self.definition_processors = { - 'resource': self.process_resource_or_mapping, - 'mapping': self.process_resource_or_mapping, - 'license': self.process_license + 'resource': self._process_resource_or_mapping, + 'mapping': self._process_resource_or_mapping, + 'license': self._process_license } self.patterns_by_proto = {} + self.file_sha256sums = {} + + self.content_dir_path = pathlib.Path(content_dir_path).resolve() + + if not self.content_dir_path.is_dir(): + raise ValueError(_('content_dir_path_not_dir')) + + for subdir_path in self.content_dir_path.iterdir(): + if not subdir_path.is_dir(): + continue + try: + self._load_content_from_subdir(subdir_path, subdir_path.name) + except Exception as e: + if current_app._pydrilla_werror: + raise e from None + logging.error(_('couldnt_load_content_from_%s'), subdir_path, + exc_info=True) + + self._report_missing() + self._finalize() + + def _load_content_from_subdir(self, subdir_path, source_name): + ''' + Helper function used to load definitions from index.json of a + subdirectory of the content directory. + ''' + index_path = subdir_path / 'index.json' + with open(index_path) as index_file: + index = json.loads(strip_json_comments(index_file.read())) + + self._process_index(index, source_name) @staticmethod def register_item(dict, item): @@ -361,13 +396,38 @@ class Content: @staticmethod def _process_copyright_and_license(definition): - '''Helper function used by other process_*() methods.''' + '''Helper function used by other _process_*() methods.''' for field in ['copyright', 'licenses']: if definition[field] == 'auto': raise MyNotImplError(f'"{{field}}": "auto"', definition['source_name']) - def process_resource_or_mapping(self, definition, index): + def _get_file_sha256sum(self, path): + ''' + Compute sha256 of the file at path. Cache results on this Content + object. 
+ ''' + path = path.resolve() + sha256sum = self.file_sha256sums.get(path) + + if sha256sum is None: + with open(path, mode='rb') as hashed_file: + sha256sum = sha256(hashed_file.read()).digest().hex() + self.file_sha256sums[path] = sha256sum + + return sha256sum + + def _add_file_sha256sum(self, source_name, file_object): + ''' + Expect file_object to be a dict with field "file" holding a file path + relative to content directory's subdirectory source_name. Compute or + fetch from cache the sha256 sum of that file and put it in file_object's + "sha256" field. + ''' + file_path = self.content_dir_path / source_name / file_object['file'] + file_object['sha256'] = self._get_file_sha256sum(file_path) + + def _process_resource_or_mapping(self, definition, index): ''' Sanitizes, autocompletes and registers serveable mapping/resource definition. @@ -378,10 +438,13 @@ class Content: self._process_copyright_and_license(definition) definition['dependencies'] = definition.get('dependencies', []) self.register_item(self.resources, definition) + source_name = definition['source_name'] + for script in definition['scripts']: + self._add_file_sha256sum(source_name, script) else: self.register_item(self.mappings, definition) - def process_license(self, license, index): + def _process_license(self, license, index): '''Sanitizes and registers serveable license definition.''' identifier = license['identifier'] if identifier in self.licenses: @@ -389,7 +452,15 @@ class Content: self.licenses[identifier] = license - def process_index(self, index, source_name): + source_name = license['source_name'] + for legal_text in license['legal_text']: + self._add_file_sha256sum(source_name, legal_text) + + notice = license.get('notice') + if notice is not None: + self._add_file_sha256sum(source_name, notice) + + def _process_index(self, index, source_name): ''' Sanitizes, autocompletes and registers data from a loaded index.json file. 
@@ -429,7 +500,7 @@ class Content: for item in versioned_item.by_version.values(): yield item - def report_missing(self): + def _report_missing(self): ''' Use logger to print information about items that are referenced but were not loaded. @@ -488,7 +559,7 @@ class Content: if payload not in self.resources: report_missing_payload(mapping, payload) - def finalize(self): + def _finalize(self): ''' Initialize structures needed to serve queries. Called once after all data gets loaded. @@ -543,35 +614,6 @@ class Content: return list(mappings.values()) -def load_content_from_subdir(subdir_path, source_name, content): - index_path = subdir_path / 'index.json' - with open(index_path) as index_file: - index = json.loads(strip_json_comments(index_file.read())) - - content.process_index(index, source_name) - -def load_content(path): - if not path.is_dir(): - raise ValueError(_('content_dir_path_not_dir')) - - content = Content() - - for subdir_path in path.iterdir(): - if not subdir_path.is_dir(): - continue - try: - load_content_from_subdir(subdir_path, subdir_path.name, content) - except Exception as e: - if current_app._pydrilla_werror: - raise e from None - logging.error(_('couldnt_load_content_from_%s'), subdir_path, - exc_info=True) - - content.report_missing() - content.finalize() - - return content - def create_app(config_path=(here / 'config.json'), flask_config={}): app = Flask(__package__) app.config.update(flask_config) @@ -603,7 +645,7 @@ def create_app(config_path=(here / 'config.json'), flask_config={}): if not content_dir.is_absolute(): content_dir = config_path.parent / content_dir with app.app_context(): - app._pydrilla_content = load_content(content_dir.resolve()) + app._pydrilla_content = Content(content_dir.resolve()) app.register_blueprint(bp) diff --git a/src/test/example_content/hello/index.json b/src/test/example_content/hello/index.json index 12105c2..16843cb 100644 --- a/src/test/example_content/hello/index.json +++ 
b/src/test/example_content/hello/index.json @@ -150,8 +150,9 @@ // Array of javascript files that belong to this resource. "scripts": [ { - // Script name. It should also be a valid file path. - "name": "hello.js", + // Script name. It should also be a valid file path relative + // to index.json's containing directory. + "file": "hello.js", // Copyright and license info of a script file can be // specified using the same format as in the case of the // index.json file itself. If "copyright" or "license" is @@ -160,7 +161,7 @@ "copyright": "auto", "licenses": "auto" }, { - "name": "bye.js" + "file": "bye.js" } ] }, { @@ -175,7 +176,7 @@ "licenses": "CC0-1.0", // If "dependencies" is empty, it can also be omitted. // "dependencies": [], - "scripts": [{"name": "message.js"}] + "scripts": [{"file": "message.js"}] }, { "type": "mapping", @@ -262,14 +263,16 @@ // // "comment": "Expat license is the most common form of the license often called \"MIT\". Many other forms of \"MIT\" license exist. Here the name \"Expat\" is used to avoid ambiguity." - // If applicable, a "notice" can be included. It shall then be a - // path (relative to index.json) to a plain text file with that - // notice. + // If applicable, a "notice" can be included. It shall then be an + // object with "file" field containing a path (relative to + // index.json's directory) to a plain text file with that notice. // - // "notice": "license-notice.txt" + // "notice": { + // "file": "license-notice.txt" + // } // // This is needed for example in case of GNU licenses (both with and - // without exceptions). For example, + // without exceptions). 
For instance, // "GPL-3.0-or-later-with-html-exception" could have the following // in its notice file: // diff --git a/src/test/test_pydrilla.py b/src/test/test_pydrilla.py index 22022ae..50757a7 100644 --- a/src/test/test_pydrilla.py +++ b/src/test/test_pydrilla.py @@ -28,6 +28,7 @@ import pytest import sys import shutil from pathlib import Path +from hashlib import sha256 from os import mkdir, unlink, environ import json from markupsafe import escape @@ -37,6 +38,7 @@ from pydrilla import pydrilla, create_app test_dir = Path(__file__).resolve().parent packages_dir = test_dir.parent development_config_path = test_dir / 'development_config.json' +example_content_dir = test_dir / 'example_content' @pytest.fixture def client(): @@ -51,6 +53,11 @@ def development_config(): yield json.loads(pydrilla.strip_json_comments(config_file.read())) def test_api_basic(client, development_config): + def verify_sha256sum(source_name, file_object): + with open(example_content_dir / source_name / file_object['file'], + mode='rb') as file: + assert sha256(file.read()).digest().hex() == file_object['sha256'] + response = client.get('/') assert b'html' in response.data sources_uri = development_config['hydrilla_sources_uri'] @@ -63,6 +70,11 @@ def test_api_basic(client, development_config): assert definition['type'] == item_type assert definition['source_name'] == 'hello' assert definition['version'] == [2021, 11, 10] + if item_type == 'resource': + assert type(definition['scripts']) is list + assert len(definition['scripts']) > 0 + for script_file in definition['scripts']: + verify_sha256sum(definition['source_name'], script_file) response = client.get(f'/{item_type}s/helloapple?ver=2021.11.10.0') assert response.status_code == 200 @@ -91,6 +103,11 @@ def test_api_basic(client, development_config): assert definition['long_name'] == 'Creative Commons Zero v1.0 Universal' assert definition['source_name'] == 'hello' + assert type(definition['legal_text']) is list + assert 
len(definition['legal_text']) > 0 + for license_file in definition['legal_text']: + verify_sha256sum(definition['source_name'], license_file) + response = client.get('/licenses/random-bad-identifier') assert response.status_code == 404 -- cgit v1.2.3