From 96068ada37bfa1d7e6485551138ba36600664caf Mon Sep 17 00:00:00 2001
From: Wojtek Kosior
Date: Sat, 20 Nov 2021 18:29:59 +0100
Subject: replace cookies with synchronous XmlHttpRequest as policy smuggling
 method.

Note: this breaks Mozilla port of Haketilo. Synchronous XmlHttpRequest doesn't
work as well there. This will be fixed with dynamically-registered content
scripts later.
---
 common/misc.js    |  2 +-
 common/signing.js | 74 -------------------------------------------------------
 2 files changed, 1 insertion(+), 75 deletions(-)
 delete mode 100644 common/signing.js

(limited to 'common')

diff --git a/common/misc.js b/common/misc.js
index 9ffb7ff..5b0addb 100644
--- a/common/misc.js
+++ b/common/misc.js
@@ -49,7 +49,7 @@ function gen_nonce(length=16)
 function make_csp_rule(policy)
 {
     let rule = "prefetch-src 'none'; script-src-attr 'none';";
-    const script_src = policy.has_payload ?
+    const script_src = policy.nonce !== undefined ?
 	`'nonce-${policy.nonce}'` : "'none'";
     rule += ` script-src ${script_src}; script-src-elem ${script_src};`;
     return rule;
diff --git a/common/signing.js b/common/signing.js
deleted file mode 100644
index 11cd442..0000000
--- a/common/signing.js
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * This file is part of Haketilo.
- *
- * Functions: Operations related to "signing" of data.
- *
- * Copyright (C) 2021 Wojtek Kosior
- * Redistribution terms are gathered in the `copyright' file.
- */
-
-/*
- * IMPORTS_START
- * IMPORT sha256
- * IMPORT browser
- * IMPORT is_mozilla
- * IMPORTS_END
- */
-
-/*
- * In order to make certain data synchronously accessible in certain contexts,
- * Haketilo smuggles it in string form in places like cookies, URLs and
- * headers. When using the smuggled data, we first need to make sure it isn't
- * spoofed. For that, we use this pseudo-signing mechanism.
- *
- * Despite what the name suggests, no asymmetric cryptography is involved, as
- * it would bring no additional benefits and would incur bigger performance
- * overhead. Instead, we hash the string data together with some secret value
- * that is supposed to be known only by this browser instance. The resulting
- * hash sum plays the role of the signature. In the hash we also include the
- * current time. This way, even if signed data leaks (which shouldn't happen
- * in the first place), an attacker won't be able to re-use it indefinitely.
- *
- * The secret shared between execution contexts has to be available
- * synchronously. Under Mozilla, this is the extension's per-session id. Under
- * Chromium, this is a dummy web-accessible-resource name that resides in the
- * manifest and is supposed to be constructed by each user using a unique value
- * (this is done automatically by `build.sh').
- */
-
-function get_secret()
-{
-    if (is_mozilla)
-	return browser.runtime.getURL("dummy");
-
-    return chrome.runtime.getManifest().web_accessible_resources
-	.map(r => /^chromium-key-dummy-file-(.*)/.exec(r)).filter(r => r)[0][1];
-}
-
-function extract_signed(signature, signed_data)
-{
-    const match = /^([1-9][0-9]{12}|0)_(.*)$/.exec(signed_data);
-    if (!match)
-	return {fail: "bad format"};
-
-    const result = {time: parseInt(match[1]), data: match[2]};
-    if (sign_data(result.data, result.time)[0] !== signature)
-	result.fail = "bad signature";
-
-    return result;
-}
-
-/*
- * Sign a given string for a given time. Time should be either 0 or in the
- * range 10^12 <= time < 10^13.
- */
-function sign_data(data, time) {
-    return [sha256(get_secret() + time + data), `${time}_${data}`];
-}
-
-/*
- * EXPORTS_START
- * EXPORT extract_signed
- * EXPORT sign_data
- * EXPORTS_END
- */
-- 
cgit v1.2.3
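
For reference, the pseudo-signing scheme deleted above, condensed into a short
sketch (function names come from the deleted file; `Date.now()` stands in for
the caller-provided time):

    // sign_data() pairs a sha256-based signature with "<time>_<data>":
    const [signature, signed_data] = sign_data("smuggled-policy", Date.now());
    // extract_signed() recomputes the hash over secret + time + data:
    const result = extract_signed(signature, signed_data);
    // result.data === "smuggled-policy"; result.fail is set on a bad
    // signature or a malformed "<time>_<data>" string.
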
From 6106c789ee818fd18240fd3f99eead598406852f Mon Sep 17 00:00:00 2001
From: Wojtek Kosior
Date: Tue, 30 Nov 2021 19:31:49 +0100
Subject: rewrite parts of build script in awk

---
 CHROMIUM_exports_init.js |   3 +
 MOZILLA_exports_init.js  |  57 ++++++++++
 build.sh                 | 257 +++++++++++------------------------------------
 common/storage_light.js  |   1 +
 compute_scripts.awk      | 196 ++++++++++++++++++++++++++++++++++++
 copyright                |   3 +-
 process_html_file.sh     |   2 +-
 shell_utils.sh           |  25 ++---
 upload_amo.sh            |  25 +++--
 write_makefile.sh        |   4 +-
 10 files changed, 343 insertions(+), 230 deletions(-)
 create mode 100644 CHROMIUM_exports_init.js
 create mode 100644 MOZILLA_exports_init.js
 create mode 100644 compute_scripts.awk

(limited to 'common')

diff --git a/CHROMIUM_exports_init.js b/CHROMIUM_exports_init.js
new file mode 100644
index 0000000..d2ca065
--- /dev/null
+++ b/CHROMIUM_exports_init.js
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: CC0-1.0
+
+window.killtheweb={is_chrome: true, browser: window.chrome};
diff --git a/MOZILLA_exports_init.js b/MOZILLA_exports_init.js
new file mode 100644
index 0000000..0015f0c
--- /dev/null
+++ b/MOZILLA_exports_init.js
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+/**
+ * This file is part of Haketilo.
+ *
+ * Function: Initialize the script export mechanism under Mozilla browsers.
+ *
+ * Copyright (C) 2021 Wojtek Kosior
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * As additional permission under GNU GPL version 3 section 7, you
+ * may distribute forms of that code without the copy of the GNU
+ * GPL normally required by section 4, provided you include this
+ * license notice and, in case of non-source distribution, a URL
+ * through which recipients can access the Corresponding Source.
+ * If you modify file(s) with this exception, you may extend this
+ * exception to your version of the file(s), but you are not
+ * obligated to do so. If you do not wish to do so, delete this
+ * exception statement from your version.
+ *
+ * As a special exception to the GPL, any HTML file which merely
+ * makes function calls to this code, and for that purpose
+ * includes it by reference shall be deemed a separate work for
+ * copyright law purposes. If you modify this code, you may extend
+ * this exception to your version of the code, but you are not
+ * obligated to do so. If you do not wish to do so, delete this
+ * exception statement from your version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
+ * license. Although I request that you do not make use of this code in a
+ * proprietary program, I am not going to enforce this in court.
+ */
+
+/* Polyfill for IceCat 60. */
+String.prototype.matchAll = String.prototype.matchAll || function(regex) {
+    if (regex.flags.search("g") === -1)
+	throw new TypeError("String.prototype.matchAll called with a non-global RegExp argument");
+
+    for (const matches = [];;) {
+	if (matches[matches.push(regex.exec(this)) - 1] === null)
+	    return matches.splice(0, matches.length - 1);
+    }
+}
+
+window.killtheweb={is_mozilla: true, browser: this.browser};
diff --git a/build.sh b/build.sh
index ed6a141..3d3d4be 100755
--- a/build.sh
+++ b/build.sh
@@ -3,180 +3,70 @@
 # Copyright (C) 2021 Wojtek Kosior
 # Redistribution terms are gathered in the `copyright' file.
 
-handle_export_line() {
-    if [ "x$1" = "xEXPORTS_START" ]; then
-	if [ "$STATE" = "before_block" ]; then
-	    STATE="in_block"
-	fi
-    elif [ "x$1" = "xEXPORT" ]; then
-	if [ "$STATE" != "in_block" ]; then
-	    return
-	fi
-
-	EXPORTCODE="${EXPORTCODE}window.killtheweb.$2 = $2;$ENDL"
-
-	PREVIOUS_FILE="$(map_get EXPORTS $2)"
-	if [ "x$PREVIOUS_FILE" != "x" ]; then
-	    errcho "export $2 present in both $PREVIOUS_FILE and $FILE"
-	    return 1
-	fi
-
-	map_set_instr EXPORTS $2 "$FILE"
-
-    elif [ "x$1" = "xEXPORTS_END" ]; then
-	if [ "$STATE" = "in_block" ]; then
-	    STATE="after_block"
-	fi
-    fi
-}
-
-translate_exports() {
-    STATE="before_block"
-    EXPORTCODE=''
-
-    while read EXPORT_LINE; do
-	handle_export_line $EXPORT_LINE || return 1
-    done
-
-    map_set_instr EXPORTCODES $FILEKEY "$EXPORTCODE"
-}
-
-add_exports() {
-    FILE="$1"
-    FILEKEY="$(sanitize "$FILE")"
-
-    eval "$(grep -o 'EXPORT.\+' "$1" | translate_exports || exit 1)"
-}
-
-handle_import_line() {
-    if [ "x$1" = "xIMPORTS_START" ]; then
-	if [ "$STATE" = "before_block" ]; then
-	    STATE="in_block"
-	fi
-    elif [ "x$1" = "xIMPORT" ]; then
-	if [ "$STATE" != "in_block" ]; then
-	    return
-	fi
-
-	IMPORTCODE="${IMPORTCODE}const $2 = window.killtheweb.$2;$ENDL"
-
-	IMPORTS="$IMPORTS $2"
-
-    elif [ "x$1" = "xIMPORTS_END" ]; then
-	if [ "$STATE" = "in_block" ]; then
-	    STATE="after_block"
-	fi
-    fi
-}
-
-translate_imports() {
-    STATE="before_block"
-    IMPORTCODE=''
-    IMPORTS=''
-
-    while read IMPORT_LINE; do
-	handle_import_line $IMPORT_LINE || return 1
-    done
-
-    map_set_instr IMPORTCODES $FILEKEY "$IMPORTCODE"
-    map_set_instr IMPORTS $FILEKEY "$IMPORTS"
-}
-
-add_imports() {
-    FILE="$1"
-    FILEKEY="$(sanitize "$FILE")"
-
-    eval "$(grep -o 'IMPORT.\+' "$1" | translate_imports || exit 1)"
-}
+set -e
 
-compute_scripts_list_rec() {
-    local FILE="$1"
-    local FILEKEY=$(sanitize "$1")
-
-    local FILESTATE="$(map_get FILESTATES $FILEKEY)"
-    if [ "xprocessed" = "x$FILESTATE" ]; then
-	return
-    fi
-    if [ "xprocessing" = "x$FILESTATE" ]; then
-	errcho "import loop on $FILE"
-	return 1
-    fi
-
-    USED="$USED $FILEKEY"
-
-    map_set FILESTATES $FILEKEY "processing"
-
-    local IMPORT
-    for IMPORT in $(map_get IMPORTS $FILEKEY); do
-	NEXT_FILE="$(map_get EXPORTS $IMPORT)"
-	if [ "x" = "x$NEXT_FILE" ]; then
-	    errcho "nothing exports $IMPORT, required by $FILE"
-	    return 1
-	fi
-	if ! compute_scripts_list_rec "$NEXT_FILE"; then
-	    errcho "when satisfying $IMPORT for $FILE"
-	    return 1
-	fi
-    done
-
-    [ "x$FILE" = "xexports_init.js" ] || echo $FILE # exports_init.js is hardcoded to load first; the entire export system depends on it
-    map_set FILESTATES $FILEKEY "processed"
-}
-
-compute_scripts_list() {
-    USED=''
-    echo COMPUTED_SCRIPTS=\"exports_init.js
-    compute_scripts_list_rec "$1"
-    echo \"
-
-    for FILEKEY in $USED; do
-	map_set_instr USED $FILEKEY yes
-    done
-}
+. ./shell_utils.sh
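+
+# An illustrative note on the helpers defined just below: `as_json_list
+# a.js b.js` emits the literal text `\n\t\t"a.js",\n\t\t"b.js"\n\t`; those
+# backslash escapes are only turned into real newlines and tabs later, by
+# the sed substitutions that splice the list into manifest.json.
+# as_html_list works analogously for the <script> tags in HTML files.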
 
 as_json_list() {
     while true; do
 	if [ "x" = "x$2" ]; then
-	    echo -n '\\n'"\t\t\"$1\""'\\n\t'
+	    printf '\\n\t\t"%s"\\n\t' "$1"
 	    return
 	fi
 
-	echo -n '\\n'"\t\t\"$1\","
+	printf '\\n\t\t"%s",' "$1"
 	shift
     done
 }
 
 as_html_list() {
     while [ "x" != "x$1" ]; do
-	echo -n '\\n'"    <script src=\"/$1\"></script>"
+	printf '\\n    <script src="/%s"></script>' "$1"
 	shift
     done
 }
 
+compute_scripts() {
+    local DIRS="$1"
+    local ROOT_SCRIPT="$2"
+
+    local AVAILABLE="$(find $DIRS -name '[^.#]*.js')"
+
+    awk -f compute_scripts.awk script_dependencies "$ROOT_SCRIPT" $AVAILABLE
+}
+
-build_main() {
-    # placate importers of these, as they are exported by the yet-to-be-created exports_init.js
-    EXPORTS__browser=exports_init.js
-    EXPORTS__is_chrome=exports_init.js
-    EXPORTS__is_mozilla=exports_init.js
-
-    SCRIPTDIRS='background html common content'
+build_main() {
+    local ALL_SCRIPTDIRS='background html common content'
 
-    SCRIPTS=$(find $SCRIPTDIRS -name '[^.#]*.js')
+    local ALL_SCRIPTS_AVAILABLE="$(find $ALL_SCRIPTDIRS -name '[^.#]*.js')"
 
-    for SCRIPT in $SCRIPTS; do
-	add_exports $SCRIPT
-	add_imports $SCRIPT
+    local SCRIPT
+    for SCRIPT in $ALL_SCRIPTS_AVAILABLE; do
+	map_set SCRIPTS_UNUSED $(sanitize $SCRIPT) yes
     done
 
-    eval "$(compute_scripts_list background/main.js || exit 1)"
-    BGSCRIPTS="$(as_json_list $COMPUTED_SCRIPTS)"
-    eval "$(compute_scripts_list content/main.js || exit 1)"
-    CONTENTSCRIPTS="$(as_json_list $COMPUTED_SCRIPTS)"
-    eval "$(compute_scripts_list html/display-panel.js || exit 1)"
-    POPUPSCRIPTS="$(as_html_list $COMPUTED_SCRIPTS)"
-    eval "$(compute_scripts_list html/options_main.js || exit 1)"
-    OPTIONSSCRIPTS="$(as_html_list $COMPUTED_SCRIPTS)"
+    local ROOT=background/main.js
+    local SCRIPTS_BG="$(     compute_scripts 'common/ background/' $ROOT)"
+
+    local ROOT=content/main.js
+    local SCRIPTS_CONTENT="$(compute_scripts 'common/ content/'    $ROOT)"
+
+    local ROOT=html/display-panel.js
+    local SCRIPTS_POPUP="$(  compute_scripts 'common/ html/'       $ROOT)"
+
+    local ROOT=html/options_main.js
+    local SCRIPTS_OPTIONS="$(compute_scripts 'common/ html/'       $ROOT)"
 
-    for DIR in $(find $SCRIPTDIRS -type d); do
+    local BGSCRIPTS="$(     as_json_list $SCRIPTS_BG      )"
+    local CONTENTSCRIPTS="$(as_json_list $SCRIPTS_CONTENT )"
+    local POPUPSCRIPTS="$(  as_html_list $SCRIPTS_POPUP   )"
+    local OPTIONSSCRIPTS="$(as_html_list $SCRIPTS_OPTIONS )"
+
+    for SCRIPT in $SCRIPTS_BG $SCRIPTS_CONTENT $SCRIPTS_POPUP $SCRIPTS_OPTIONS
+    do
+	map_del SCRIPTS_UNUSED $(sanitize $SCRIPT)
+    done
+
+    for DIR in $(find $ALL_SCRIPTDIRS -type d); do
 	mkdir -p "$BUILDDIR"/$DIR
     done
 
@@ -214,53 +104,24 @@ s^_CONTENTSCRIPTS_^$CONTENTSCRIPTS^" \
     sed "s^_OPTIONSSCRIPTS_^$OPTIONSSCRIPTS^" \
 	> "$BUILDDIR"/html/options.html
 
-    for FILE in $SCRIPTS; do
+    for FILE in $ALL_SCRIPTS_AVAILABLE; do
 	FILEKEY=$(sanitize "$FILE")
-	if [ "xyes" != "x$(map_get USED $FILEKEY)" ]; then
-	    errcho "WARNING! $FILE not used"
+	if [ "x$(map_get SCRIPTS_UNUSED $FILEKEY)" = "xyes" ]; then
+	    printf 'WARNING! %s not used\n' "$FILE" >&2
 	else
-	    (echo "\
-\"use strict\";
-
-({fun: (function() {
-$(map_get IMPORTCODES $FILEKEY)
-
-";
-
-# A hack to insert the contents of default_settings.json at the appropriate location in background/main.js
-if [ "$FILE" = "background/main.js" ]; then
-    # Uses an internal sed expression to escape and indent the JSON file for use in the external sed expression
-    sed 's/^    `DEFAULT SETTINGS`$/'"$(sed -E 's/([\\\&\/])/\\\1/g; s/^/    /; s/$/\\/' < default_settings.json)    "/g < "$FILE"
-else
-    cat $FILE
-fi
-
-echo "
-
-$(map_get EXPORTCODES $FILEKEY)
-})}).fun();") > "$BUILDDIR"/$FILE
+	    awk -f compute_scripts.awk wrapped_code "$FILE" > "$BUILDDIR"/$FILE
 	fi
     done
 
+    # A hack to insert the contents of default_settings.json at the appropriate
+    # location in background/main.js. Uses an internal sed expression to escape
+    # and indent the JSON file for use in the external sed expression.
+    sed -i 's/^    `DEFAULT SETTINGS`$/'"$(sed -E 's/([\\\&\/])/\\\1/g; s/^/    /; s/$/\\/' < default_settings.json)    "/g "$BUILDDIR"/background/main.js
+
     if [ "$BROWSER" = "chromium" ]; then
-	cat > "$BUILDDIR"/exports_init.js <<EOF
-[inline Chromium exports_init.js heredoc; content garbled in this copy]
-EOF
+	cp CHROMIUM_exports_init.js "$BUILDDIR"/exports_init.js
+    else
+	cp MOZILLA_exports_init.js "$BUILDDIR"/exports_init.js
     fi
-    if [ "$BROWSER" = "mozilla" ]; then
-	cat > "$BUILDDIR"/exports_init.js <<EOF
-[inline Mozilla exports_init.js heredoc; content garbled in this copy]
-EOF
-    fi
 
     if [ "$BROWSER" = "chromium" ]; then
 	for MOZILLA_FILE in $(find "$BUILDDIR" -name "MOZILLA_*"); do
-	    echo > "$MOZILLA_FILE"
+	    printf '\n' > "$MOZILLA_FILE"
 	done
     fi
 
     if [ "$BROWSER" = "mozilla" ]; then
 	for CHROMIUM_FILE in $(find "$BUILDDIR" -name "CHROMIUM_*"); do
-	    echo > "$CHROMIUM_FILE"
+	    printf '\n' > "$CHROMIUM_FILE"
 	done
     fi
 }
 
+print_usage() {
+    printf 'usage: %s mozilla|chromium [source directory] [update url]\n' \
+	   "$0" >&2
+}
+
 main() {
     if [ "x$1" = "xmozilla" -o "x$1" = "xchromium" ]; then
 	BROWSER=$1
     else
-	errcho "usage: $0 mozilla|chromium [source directory] [update url]"
+	print_usage
 	exit 1
     fi
 
@@ -296,13 +162,12 @@ main() {
 	mkdir "$BUILDDIR"
 	cd "$SRCDIR"
     else
-	errcho "usage: $0 mozilla|chromium [source directory] [update url]"
+	print_usage
 	exit 2
     fi
 
     UPDATE_URL="$3"
 
-    . ./shell_utils.sh
     build_main
 }
 
diff --git a/common/storage_light.js b/common/storage_light.js
index 32e3b1f..246e5eb 100644
--- a/common/storage_light.js
+++ b/common/storage_light.js
@@ -13,6 +13,7 @@
  * IMPORT raw_storage
  * IMPORT is_mozilla
  * IMPORT observables
+ * IMPORTS_END
  */
 
 const reg_spec = new Set(["\\", "[", "]", "(", ")", "{", "}", ".", "*", "+"]);
diff --git a/compute_scripts.awk b/compute_scripts.awk
new file mode 100644
index 0000000..123106c
--- /dev/null
+++ b/compute_scripts.awk
@@ -0,0 +1,196 @@
+# SPDX-License-Identifier: CC0-1.0
+#
+# Process javascript files and resolve dependencies between them
+#
+# This file is part of Haketilo
+#
+# Copyright (C) 2021, Wojtek Kosior
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the CC0 1.0 Universal License as published by
+# the Creative Commons Corporation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# CC0 1.0 Universal License for more details.
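+
+# The annotation format this script scans for, sketched for illustration
+# (the IMPORT name below is an example taken from this repository's scripts):
+#
+#     /*
+#      * IMPORTS_START
+#      * IMPORT sha256
+#      * IMPORTS_END
+#      */
+#
+# Typical invocations (cf. build.sh):
+#     awk -f compute_scripts.awk script_dependencies ROOT.js FILES...
+#     awk -f compute_scripts.awk wrapped_code FILE.js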
+
+function read_file(filename,
+		   imports_state, exports_state, line, record, result) {
+    imports_state = "not_started"
+    exports_state = "not_started"
+
+    do {
+	result = (getline line < filename)
+	if (result < 0) {
+	    printf "error reading %s\n", filename > "/dev/stderr"
+	    exit 1
+	}
+
+	if (imports_state == "started" &&
+	    line ~ /^([[:space:]]*\*[[:space:]]+)?IMPORT[[:space:]]+[_a-zA-Z][_a-zA-Z0-9]*[[:space:]]*$/) {
+	    record = line
+
+	    sub(/^([[:space:]]*\*[[:space:]]+)?IMPORT[[:space:]]+/, "", record)
+	    sub(/([[:space:]]+$)/, "", record)
+
+	    imports[filename,++import_counts[filename]] = record
+	}
+	if (imports_state == "started" &&
+	    line ~ /^([[:space:]]*\*[[:space:]]+)?IMPORTS_END[[:space:]]*$/)
+	    imports_state = "finished"
+	if (imports_state == "not_started" &&
+	    line ~ /^([[:space:]]*\*[[:space:]]+)?IMPORTS_START[[:space:]]*$/)
+	    imports_state = "started"
+
+	if (exports_state == "started" &&
+	    line ~ /^([[:space:]]*\*[[:space:]]+)?EXPORT[[:space:]]+[_a-zA-Z][_a-zA-Z0-9]*[[:space:]]*$/) {
+	    record = line
+
+	    sub(/^([[:space:]]*\*[[:space:]]+)?EXPORT[[:space:]]+/, "", record)
+	    sub(/([[:space:]]+$)/, "", record)
+
+	    if (record in provides) {
+		printf "ERROR: '%s' exported by both %s and %s\n",
+		    record, provides[record], filename > "/dev/stderr"
+		exit 1
+	    }
+
+	    provides[record] = filename
+	    exports[filename,++export_counts[filename]] = record
+	}
+	if (exports_state == "started" &&
+	    line ~ /^([[:space:]]*\*[[:space:]]+)?EXPORTS_END[[:space:]]*$/)
+	    exports_state = "finished"
+	if (exports_state == "not_started" &&
+	    line ~ /^([[:space:]]*\*[[:space:]]+)?EXPORTS_START[[:space:]]*$/)
+	    exports_state = "started"
+    } while (result > 0)
+
+    if (imports_state == "started") {
+	printf "ERROR: Unclosed IMPORTS list in '%s'\n", filename \
+	    > "/dev/stderr"
+	exit 1
+    }
+
+    if (exports_state == "started") {
+	printf "ERROR: Unclosed EXPORTS list in '%s'\n", filename \
+	    > "/dev/stderr"
+	exit 1
+    }
+
+    close(filename)
+}
+
+function print_file(filename,    line) {
+    while ((getline line < filename) > 0)
+	print(line)
+
+    close(filename)
+}
+
+function print_imports_code(filename,    i, count, import_name) {
+    count = import_counts[filename]
+    for (i = 1; i <= count; i++) {
+	import_name = imports[filename,i]
+	printf "const %s = window.killtheweb.%s;\n", import_name, import_name
+    }
+}
+
+function print_exports_code(filename,    i, count, export_name) {
+    count = export_counts[filename]
+    for (i = 1; i <= count; i++) {
+	export_name = exports[filename,i]
+	printf "window.killtheweb.%s = %s;\n", export_name, export_name
+    }
+}
+
+function wrap_file(filename) {
+    print "\"use strict\";\n\n({fun: (function() {\n"
+    print_imports_code(filename)
+    printf "\n\n"
+
+    print_file(filename)
+
+    printf "\n\n"
+    print_exports_code(filename)
+    print "\n})}).fun();"
+}
+
+function compute_dependencies(filename,    i, count, import_name, next_file) {
+    if (processed[filename] == "used")
+	return 0
+
+    if (processed[filename] == "on_stack") {
+	printf "import loop on %s\n", filename > "/dev/stderr"
+	return 1
+    }
+
+    processed[filename] = "on_stack"
+
+    count = import_counts[filename]
+    for (i = 1; i <= count; i++) {
+	import_name = imports[filename,i]
+	if (!(import_name in provides)) {
+	    printf "nothing exports %s, required by %s\n",
+		import_name, filename > "/dev/stderr"
+	    return 1
+	}
+
+	if (compute_dependencies(provides[import_name]) > 0) {
+	    printf "when satisfying %s for %s\n",
+		import_name, filename > "/dev/stderr"
+	    return 1
+	}
+    }
+
+    processed[filename] = "used"
+    print filename
+
+    return 0
+}
"usage: %2 compute_scripts.awk script_dependencies|wrapped_code FILENAME[...]\n", + ARGV[0] > "/dev/stderr" + exit 1 +} + +function mock_exports_init() { + provides["browser"] = "exports_init.js" + provides["is_chrome"] = "exports_init.js" + provides["is_mozilla"] = "exports_init.js" + + processed["exports_init.js"] = "used" +} + +BEGIN { + operation = ARGV[1] + + if (ARGC < 3) + print_usage() + + root_filename = ARGV[2] + + for (i = 2; i < ARGC; i++) + filenames[ARGV[i]] + + mock_exports_init() + + for (filename in filenames) { + # A filename is allowed to appear multiple times in the list. + # Let's only process it once. + if (!(filename in processed)) + read_file(filename) + processed[filename] = "not_used" + } + + if (operation == "script_dependencies") { + print("exports_init.js") + if (compute_dependencies(root_filename) > 0) + exit 1 + } else if (operation == "wrapped_code") { + wrap_file(root_filename) + } else { + print_usage() + } +} diff --git a/copyright b/copyright index 81f9966..a238d33 100644 --- a/copyright +++ b/copyright @@ -6,7 +6,8 @@ Files: * Copyright: 2021 Wojtek Kosior License: GPL-3+-javascript or Alicense-1.0 -Files: *.sh default_settings.json Makefile.in +Files: *.sh default_settings.json Makefile.in compute_scripts.awk + CHROMIUM_exports_init.js Copyright: 2021 Wojtek Kosior 2021 jahoti License: CC0 diff --git a/process_html_file.sh b/process_html_file.sh index 1ed0295..2f58cbf 100755 --- a/process_html_file.sh +++ b/process_html_file.sh @@ -12,7 +12,7 @@ FILE="$1" FILEKEY=$(sanitize "$FILE") if [ "x$(map_get HTML_FILENAMES $FILEKEY)" = "xyes" ]; then - errcho "import loop on $FILE" + printf 'import loop on %s\n' "$FILE" >&2 exit 1 fi diff --git a/shell_utils.sh b/shell_utils.sh index 5fd24ff..6d4cc76 100644 --- a/shell_utils.sh +++ b/shell_utils.sh @@ -3,21 +3,8 @@ # This file is meant to be sourced in sh. -ENDL=" -" - -# A "raw" echo, interprets neither backclash escapes nor command-line options. -# Does not emit trailing newline. -ech() { - printf %s "$*" -} - -errcho() { - echo "$@" >&2 -} - map_set_instr() { - echo "$1__$2='$3'" + printf "%s__%s='%s'" "$1" "$2" "$3" } map_set() { @@ -29,11 +16,11 @@ map_set_export() { } map_get() { - eval "echo \"\$$1__$2\"" + eval "printf %s \"\$$1__$2\"" } map_del_instr() { - echo "unset $1__$2" + printf 'unset %s__%s' "$1" "$2" } map_del() { @@ -41,18 +28,18 @@ map_del() { } sanitize() { - echo "$1" | tr /.- _ + printf %s "$1" | tr /.- _ } escape_regex_special() { - ech "$1" | sed 's/\([]\.*?{},()[-]\)/\\\1/g' + printf %s "$1" | sed 's/\([]\.*?{},()[-]\)/\\\1/g' } # Note: We don't actually parse JSON. We extract needed keys with sed regexes # which does not work in the general case but is sufficient for now. get_json_key() { local KEY_REG="$(escape_regex_special "$1")" - ech "$2" | + printf %s "$2" | sed 's/\(.*"'"$KEY_REG"'"[[:space:]]*:[[:space:]]*"\([^"]*\)"\)\?.*/\2/' | grep . 
diff --git a/upload_amo.sh b/upload_amo.sh
index 115f39a..71e12ca 100755
--- a/upload_amo.sh
+++ b/upload_amo.sh
@@ -24,11 +24,11 @@ SECRET="$3"
 XPI_PATH="$4"
 
 base64url() {
-    ech "$1" | base64 -w 0 | tr '/+' '_-' | tr -d '='
+    printf %s "$1" | base64 -w 0 | tr '/+' '_-' | tr -d '='
 }
 
 sha256hmac() {
-    base64url "$(ech "$2" | openssl dgst -sha256 -hmac "$1" -binary -)"
+    base64url "$(printf %s "$2" | openssl dgst -sha256 -hmac "$1" -binary -)"
 }
 
 get_manifest_key() {
@@ -52,8 +52,8 @@ EOF
     local JWT_MESSAGE=$(base64url "$JWT_HEAD").$(base64url "$JWT_PAYLOAD")
     local JWT_SIGNATURE=$(sha256hmac "$SECRET" "$JWT_MESSAGE")
     local JWT=$JWT_MESSAGE.$JWT_SIGNATURE
-    errcho "Using JWT: $JWT"
-    ech $JWT
+    printf 'Using JWT: %s\n' "$JWT" >&2
+    printf %s "$JWT"
 }
 
 get_extension_url() {
     EXTENSION_ID="$(get_manifest_key applications "$XPI_PATH")"
     EXTENSION_VER="$(get_manifest_key version "$XPI_PATH")"
 
     if [ -z "$EXTENSION_ID" -o -z "$EXTENSION_VER" ]; then
-	errcho "Couldn't extract extension id and version. Please check if $XPI_PATH contains proper manifest.json file."
+	printf "Couldn't extract extension id and version. Please check if %s contains proper manifest.json file.\n" \
+	       "$XPI_PATH" >&2
 	exit 1
     fi
 
-    ech "https://addons.mozilla.org/api/v4/addons/$EXTENSION_ID/versions/$EXTENSION_VER/"
+    printf 'https://addons.mozilla.org/api/v4/addons/%s/versions/%s/' \
+	   "$EXTENSION_ID" "$EXTENSION_VER"
 }
 
-usage() {
-    errcho "Usage: $_PROG_NAME upload|check|test API_KEY SECRET XPI_PATH"
+print_usage() {
+    printf 'Usage: %s upload|check|test API_KEY SECRET XPI_PATH\n' \
+	   "$_PROG_NAME" >&2
 }
 
 if [ $# != 4 ]; then
-    usage
+    print_usage
     exit 1
 fi
 
@@ -83,7 +86,7 @@ case "$OPERATION" in
     test)
 	curl "https://addons.mozilla.org/api/v4/accounts/profile/" \
 	     -g -H "Authorization: JWT $(generate_jwt)"
-	echo
+	printf '\n'
 	;;
     check)
 	RETURNED_DATA="$(curl $(get_extension_url) \
@@ -95,7 +98,7 @@ case "$OPERATION" in
 	     -H "Authorization: JWT $(generate_jwt)")"
 	;;
     *)
-	usage
+	print_usage
 	exit 1
 	;;
 esac
diff --git a/write_makefile.sh b/write_makefile.sh
index d5c0fa9..4011fe8 100755
--- a/write_makefile.sh
+++ b/write_makefile.sh
@@ -14,10 +14,10 @@
 # CC0 1.0 Universal License for more details.
 
 if [ ! -e record.conf ]; then
-    echo "Record of configuration 'record.conf' does not exist." >&2
+    printf "Record of configuration 'record.conf' does not exist.\n" >&2
     exit 1
 elif [ "$(head -n 1 record.conf | cut -c -9)x" != "srcdir = x" ]; then
-    echo "Record of configuration 'record.conf' is invalid." >&2
+    printf "Record of configuration 'record.conf' is invalid.\n" >&2
     exit 2
 fi
-- 
cgit v1.2.3

From 69e53743393664ed2db59bbe9bbeaf6f124754f3 Mon Sep 17 00:00:00 2001
From: Wojtek Kosior
Date: Wed, 24 Nov 2021 15:53:00 +0100
Subject: implement more efficient querying of URL patterns

---
 common/patterns.js            |  47 ++++---
 common/patterns_query_tree.js | 293 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 321 insertions(+), 19 deletions(-)
 create mode 100644 common/patterns_query_tree.js

(limited to 'common')

diff --git a/common/patterns.js b/common/patterns.js
index 625be05..790ce1b 100644
--- a/common/patterns.js
+++ b/common/patterns.js
@@ -7,10 +7,12 @@
  * Redistribution terms are gathered in the `copyright' file.
  */
 
-const MAX_URL_PATH_LEN = 12;
-const MAX_URL_PATH_CHARS = 255;
-const MAX_DOMAIN_LEN = 7;
-const MAX_DOMAIN_CHARS = 100;
+const MAX = {
+    URL_PATH_LEN: 12,
+    URL_PATH_CHARS: 255,
+    DOMAIN_LEN: 7,
+    DOMAIN_CHARS: 100
+};
 
 const proto_regex = /^(\w+):\/\/(.*)$/;
 
@@ -25,11 +27,17 @@ const file_regex = new RegExp(`^(${path_re}).*`);
 
 const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);
 
-function deconstruct_url(url)
+function deconstruct_url(url, use_limits=true)
 {
+    /* Work on a local copy of the limits; aliasing MAX here would otherwise
+     * permanently overwrite the global limits when `use_limits` is false. */
+    const max = {};
+    for (const key in MAX)
+	max[key] = use_limits ? MAX[key] : Infinity;
+
     const proto_match = proto_regex.exec(url);
     if (proto_match === null)
-	return undefined;
+	throw `bad url '${url}'`;
 
     const deco = {proto: proto_match[1]};
 
@@ -37,20 +45,21 @@ function deconstruct_url(url)
 	deco.path = file_regex.exec(proto_match[2])[1];
     } else if (deco.proto === "ftp") {
 	[deco.domain, deco.path] = ftp_regex.exec(proto_match[2]).slice(2, 4);
-    } else {
+    } else if (deco.proto === "http" || deco.proto === "https") {
 	const http_match = http_regex.exec(proto_match[2]);
 	if (!http_match)
 	    return undefined;
 	[deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
+    } else {
+	throw `unsupported protocol in url '${url}'`;
     }
 
-    const leading_dash = deco.path[0] === "/";
     deco.trailing_dash = deco.path[deco.path.length - 1] === "/";
 
     if (deco.domain) {
-	if (deco.domain.length > MAX_DOMAIN_CHARS) {
+	if (deco.domain.length > max.DOMAIN_CHARS) {
 	    const idx = deco.domain.indexOf(".", deco.domain.length -
-					    MAX_DOMAIN_CHARS);
+					    max.DOMAIN_CHARS);
 	    if (idx === -1)
 		deco.domain = [];
 	    else
@@ -59,7 +68,7 @@ function deconstruct_url(url)
 	    deco.domain_truncated = true;
 	}
 
-	if (deco.path.length > MAX_URL_PATH_CHARS) {
+	if (deco.path.length > max.URL_PATH_CHARS) {
 	    deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
 	    deco.path_truncated = true;
 	}
@@ -67,16 +76,14 @@ function deconstruct_url(url)
 
     if (typeof deco.domain === "string") {
 	deco.domain = deco.domain.split(".");
-	if (deco.domain.splice(0, deco.domain.length - MAX_DOMAIN_LEN).length
+	if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
 	    > 0)
 	    deco.domain_truncated = true;
     }
 
     deco.path = deco.path.split("/").filter(s => s !== "");
-    if (deco.domain && deco.path.splice(MAX_URL_PATH_LEN).length > 0)
+    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
 	deco.path_truncated = true;
 
-    if (leading_dash || deco.path.length === 0)
-	deco.path.unshift("");
-
     return deco;
 }
@@ -98,13 +105,14 @@ function* each_domain_pattern(deco)
 
 function* each_path_pattern(deco)
 {
-    for (let slice = deco.path.length; slice > 0; slice--) {
-	const path_part = deco.path.slice(0, slice).join("/");
+    for (let slice = deco.path.length; slice >= 0; slice--) {
+	const path_part = ["", ...deco.path.slice(0, slice)].join("/");
 	const path_wildcards = [];
 	if (slice === deco.path.length && !deco.path_truncated) {
-	    if (deco.trailing_dash)
+	    if (deco.trailing_dash && path_part !== "")
 		yield path_part + "/";
-	    yield path_part;
+	    if (path_part !== "" || deco.proto !== "file")
+		yield path_part;
 	}
 	if (slice === deco.path.length - 1 && !deco.path_truncated &&
 	    deco.path[slice] !== "*")
@@ -137,5 +145,6 @@ function* each_url_pattern(url)
 /*
  * EXPORTS_START
  * EXPORT each_url_pattern
+ * EXPORT deconstruct_url
  * EXPORTS_END
  */
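
For orientation, a rough sketch of what the rewritten deconstruct_url()
returns (the fields follow from the hunks above; the exact value of `query`
is an educated guess, as the http regex itself lies outside this diff):

    // deconstruct_url("https://example.com/a/b/", false) gives roughly:
    // { proto: "https", domain: ["example", "com"], path: ["a", "b"],
    //   query: "", trailing_dash: true }
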
diff --git a/common/patterns_query_tree.js b/common/patterns_query_tree.js
new file mode 100644
index 0000000..93f74c6
--- /dev/null
+++ b/common/patterns_query_tree.js
@@ -0,0 +1,293 @@
+/**
+ * This file is part of Haketilo.
+ *
+ * Function: Data structure to query items by URL patterns.
+ *
+ * Copyright (C) 2021 Wojtek Kosior
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * As additional permission under GNU GPL version 3 section 7, you
+ * may distribute forms of that code without the copy of the GNU
+ * GPL normally required by section 4, provided you include this
+ * license notice and, in case of non-source distribution, a URL
+ * through which recipients can access the Corresponding Source.
+ * If you modify file(s) with this exception, you may extend this
+ * exception to your version of the file(s), but you are not
+ * obligated to do so. If you do not wish to do so, delete this
+ * exception statement from your version.
+ *
+ * As a special exception to the GPL, any HTML file which merely
+ * makes function calls to this code, and for that purpose
+ * includes it by reference shall be deemed a separate work for
+ * copyright law purposes. If you modify this code, you may extend
+ * this exception to your version of the code, but you are not
+ * obligated to do so. If you do not wish to do so, delete this
+ * exception statement from your version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
+ * license. Although I request that you do not make use of this code in a
+ * proprietary program, I am not going to enforce this in court.
+ */
+
+/*
+ * IMPORTS_START
+ * IMPORT deconstruct_url
+ * IMPORTS_END
+ */
+
+/*
+ * This is a javascript rewrite of the Python implementation of the algorithm
+ * here:
+ * https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205
+ */
+
+/* "Pattern Tree" is how we refer to the data structure used for querying
+ * Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
+ * is to make it possible, for a given URL, to quickly retrieve all known
+ * patterns that match it.
+ */
+function make_tree_node() {
+    return {
+	wildcard_matches: [null, null, null],
+	literal_match: null,
+	children: {}
+    };
+}
+
+const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0;
+
+/*
+ * Yields all matches of this segments sequence against the tree that starts at
+ * this node. Results are produced in order from greatest to lowest pattern
+ * specificity.
+ */
+function* search_sequence(tree_node, segments)
+{
+    const nodes = [tree_node];
+
+    for (const segment of segments) {
+	const next_node = nodes[nodes.length - 1].children[segment];
+	if (next_node === undefined)
+	    break;
+
+	nodes.push(next_node);
+    }
+
+    const nsegments = segments.length;
+
+    const conds = [
+	/* literal pattern match */
+	() => nodes.length == nsegments,
+	/* wildcard pattern matches */
+	() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
+	() => nodes.length + 1 < nsegments,
+	() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
+    ];
+
+    while (nodes.length) {
+	const node = nodes.pop();
+	const items = [node.literal_match, ...node.wildcard_matches];
+
+	for (let i = 0; i < 4; i++) {
+	    if (items[i] !== null && conds[i]())
+		yield items[i];
+	}
+    }
+}
+
+/*
+ * Make item queryable through (this branch of) the Pattern Tree. If there was
+ * not yet any item associated with the tree path designated by segments,
+ * create a new one using the item_instantiator() function. Return all items
+ * matching this path (both the ones that existed and the ones just created).
+ */
+function add_sequence(tree_node, segments, item_instantiator)
+{
+    let wildcards;
+
+    for (var segment of segments) {
+	wildcards = tree_node.wildcard_matches;
+
+	const child = tree_node.children[segment] || make_tree_node();
+	tree_node.children[segment] = child;
+	tree_node = child;
+    }
+
+    if (tree_node.literal_match === null)
+	tree_node.literal_match = item_instantiator();
+
+    if (!is_wildcard(segment))
+	return [tree_node.literal_match];
+
+    if (wildcards[segment.length - 1] === null)
+	wildcards[segment.length - 1] = item_instantiator();
+
+    return [tree_node.literal_match, wildcards[segment.length - 1]];
+}
+
+/*
+ * Can be used to remove an item from (this branch of) the Pattern Tree.
+ * item_modifier should be a function that accepts 1 argument, the item stored
+ * in the tree, and returns an item that should be used in place of the first
+ * one. It is also legal for it to return the same item, modifying it first. If
+ * it returns `undefined`, it means the item should be deleted from the Tree.
+ *
+ * item_modifier() will be called with all items in (this branch of) the
+ * Pattern Tree that correspond to the path designated by segments. The Tree
+ * will be updated accordingly, with nodes that no longer hold any items
+ * getting pruned.
+ */
+function modify_sequence(tree_node, segments, item_modifier)
+{
+    const nodes = [tree_node];
+
+    /* Walk down the tree; a missing node means there is nothing to modify. */
+    for (const segment of segments) {
+	const next_node = nodes[nodes.length - 1].children[segment];
+	if (next_node === undefined)
+	    return;
+
+	nodes.push(next_node);
+    }
+
+    const modify = old_item => {
+	if (old_item === null)
+	    return null;
+	const new_item = item_modifier(old_item);
+	return new_item === undefined ? null : new_item;
+    };
+
+    const last_node = nodes[nodes.length - 1];
+    last_node.literal_match = modify(last_node.literal_match);
+
+    /* A trailing wildcard's item lives on the parent node, mirroring
+     * add_sequence() above. */
+    const last_segment = segments[segments.length - 1];
+    if (segments.length > 0 && is_wildcard(last_segment)) {
+	const wildcards = nodes[nodes.length - 2].wildcard_matches;
+	const idx = last_segment.length - 1;
+	wildcards[idx] = modify(wildcards[idx]);
+    }
+
+    /* Prune tree nodes that no longer hold any items or children. */
+    const is_empty = node =>
+	node.literal_match === null &&
+	node.wildcard_matches.every(item => item === null) &&
+	Object.keys(node.children).length === 0;
+
+    for (let i = segments.length - 1; i >= 0; i--) {
+	if (!is_empty(nodes[i + 1]))
+	    break;
+
+	delete nodes[i].children[segments[i]];
+    }
+}
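+
+/*
+ * An illustrative example of the API below (the pattern, item name and item
+ * are made up): after
+ *
+ *     const tree = pattern_tree.make();
+ *     pattern_tree.register(tree, "https://*.example.com/a/b", "id1", item);
+ *
+ * pattern_tree.search(tree, "https://www.example.com/a/b") yields the single
+ * entry ["id1", item]; the "*" matches exactly one domain label.
+ */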
+} + +/* + * Make item queryable through the Pattern Tree that starts with the protocols + * dictionary object passed in the first argument. + */ +function pattern_tree_register(patterns_by_proto, pattern, item_name, item) +{ + /* + * We pass 'false' to disable length limits on URL parts. Length limits are + * mostly useful in case of iteration over all patterns matching given URL. + * Here we don't do that. + */ + const deco = deconstruct_url(pattern, false); + + const tree = patterns_by_proto[deco.proto] || make_tree_node(); + patterns_by_proto[deco.proto] = tree; + + let path_trees; + + if (deco.proto === "file") { + path_trees = [tree]; + } else { + /* We need an aray of domain labels ordered most-significant-first. */ + const domain = [...deco.domain].reverse(); + path_trees = add_sequence(tree, domain, make_tree_node); + } + + for (const path_tree of path_trees) { + for (const match_obj of add_sequence(path_tree, deco.path, () => {})) + match_obj[item_name] = item; + } +} + +/* + * Remove registered item from the Pattern Tree that starts with the protocols + * dictionary object passed in the first argument. The remaining 2 arguments + * should be pattern and name that have been earlier passed to + * pattern_tree_register(). + */ +function pattern_tree_deregister(patterns_by_proto, pattern, item_name) +{ + const deco = deconstruct_url(pattern, false); + + const tree = patterns_by_proto[deco.proto] || make_tree_node(); + patterns_by_proto[deco.proto] = tree; + + let path_trees; + + if (deco.proto === "file") { + path_trees = [tree]; + } else { + /* We need an aray of domain labels ordered most-significant-first. */ + const domain = [...deco.domain].reverse(); + .......................... + path_trees = add_sequence(tree, domain, make_tree_node); + } +} + +/* + * Yield registered items (as [item_name, item] arrays) that match url. For + * cosistent behavior, please don't modify the pattern tree while iterating over + * the results. + */ +function* pattern_tree_search(patterns_by_proto, url) +{ + const deco = deconstruct_url(url, false); + + tree = patterns_by_proto[deco.proto] || make_tree_node(); + + let path_trees; + + if (deco.proto === "file") { + path_trees = [tree]; + } else { + /* We need an aray of domain labels ordered most-significant-first. */ + const domain = [...deco.domain].reverse(); + path_trees = search_sequence(tree, domain); + } + + for (const path_tree of path_trees) { + for (const match_obj of search_sequence(path_tree, deco.path)) { + for (const entry of Object.entries(match_obj)) + yield entry; + } + } +} + +const patterns_tree = { + make: () => {}, + register: patterns_tree_register, + search: patterns_tree_search +} + +/* + * EXPORTS_START + * EXPORT patterns_tree + * EXPORTS_END + */ -- cgit v1.2.3