browser-extension - A Web Extension to facilitate replacing sites' js with user-supplied scripts

/**
 * This file is part of Haketilo.
 *
 * Function: Main content script that runs in all frames.
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Copyright (C) 2021 jahoti
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * As additional permission under GNU GPL version 3 section 7, you
 * may distribute forms of that code without the copy of the GNU
 * GPL normally required by section 4, provided you include this
 * license notice and, in case of non-source distribution, a URL
 * through which recipients can access the Corresponding Source.
 * If you modify file(s) with this exception, you may extend this
 * exception to your version of the file(s), but you are not
 * obligated to do so. If you do not wish to do so, delete this
 * exception statement from your version.
 *
 * As a special exception to the GPL, any HTML file which merely
 * makes function calls to this code, and for that purpose
 * includes it by reference shall be deemed a separate work for
 * copyright law purposes. If you modify this code, you may extend
 * this exception to your version of the code, but you are not
 * obligated to do so. If you do not wish to do so, delete this
 * exception statement from your version.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
 * license. Although I request that you do not make use this code in a
 * proprietary program, I am not going to enforce this in court.
 */

/*
 * IMPORTS_START
 * IMPORT handle_page_actions
 * IMPORT gen_nonce
 * IMPORT is_privileged_url
 * IMPORT browser
 * IMPORT is_chrome
 * IMPORT is_mozilla
 * IMPORT start_activity_info_server
 * IMPORT make_csp_rule
 * IMPORT csp_header_regex
 * IMPORT report_settings
 * IMPORTS_END
 */

/*
 * `content_loaded' is our own flag stored on the document object. It starts
 * out true only when readyState is already "complete" and is set to true once
 * the promise returned by wait_loaded(document) resolves.
 */
document.content_loaded = document.readyState === "complete";

/*
 * Return a promise that is already resolved when `e.content_loaded' is truthy
 * and that otherwise resolves upon the next `DOMContentLoaded' event
 * dispatched on `e'.
 */
const wait_loaded = e => {
    if (e.content_loaded)
	return Promise.resolve();

    return new Promise(resolve => {
	e.addEventListener("DOMContentLoaded", resolve, {once: true});
    });
};

wait_loaded(document).then(() => { document.content_loaded = true; });

/*
 * In the case of HTML documents:
 * 1. When injecting some payload we need to sanitize <meta> CSP tags before
 *    they reach the document.
 * 2. Only <meta> tags inside <head> are considered valid by the browser and
 *    need to be considered.
 * 3. We want to detach <html> from document, wait until its <head> completes
 *    loading, sanitize it and re-attach <html>.
 * 4. We shall wait for anything to appear in or after <body> and take that as
 *    a sign <head> has finished loading.
 * 5. Otherwise, getting the `DOMContentLoaded' event on the document shall also
 *    be a sign that <head> is fully loaded.
 */

/*
 * Create a MutationObserver that re-runs try_body_started() for `waiting'
 * each time the direct children of `DOM_element' change. The observer is
 * returned already observing, so that the caller can disconnect it later.
 */
function make_body_start_observer(DOM_element, waiting)
{
    const on_children_changed = () => try_body_started(waiting);
    const body_start_observer = new MutationObserver(on_children_changed);

    body_start_observer.observe(DOM_element, {childList: true});

    return body_start_observer;
}

/*
 * Check whether parsing has progressed past <head>: either the detached
 * <html> has a <body> with some child or following sibling, or the element
 * currently serving as document's root has a following sibling. If so, finish
 * the wait and return true.
 *
 * Otherwise return nothing; if a <body> exists and fewer than 2 observers are
 * registered, additionally start observing that <body>.
 */
function try_body_started(waiting)
{
    const body = waiting.detached_html.querySelector("body");
    const body_has_content = body && (body.firstChild || body.nextSibling);

    if (body_has_content || waiting.doc.documentElement.nextSibling) {
	finish_waiting(waiting);
	return true;
    }

    if (!body || waiting.observers.length >= 2)
	return;

    waiting.observers.push(make_body_start_observer(body, waiting));
}

/*
 * Conclude the wait described by `waiting' at most once: mark it finished,
 * disconnect all registered observers and schedule `waiting.callback' to run
 * through setTimeout() with zero delay. Subsequent calls are no-ops.
 */
function finish_waiting(waiting)
{
    if (!waiting.finished) {
	waiting.finished = true;

	for (const observer of waiting.observers)
	    observer.disconnect();

	setTimeout(waiting.callback, 0);
    }
}

/*
 * Call `callback' once <head> of the detached root element `detached_html' can
 * be considered fully parsed: either try_body_started() reports that (right
 * away or from one of the mutation observers) or wait_loaded(doc) resolves,
 * whichever happens first. finish_waiting() makes sure the callback is only
 * scheduled once.
 */
function _wait_for_head(doc, detached_html, callback)
{
    const waiting = {doc, detached_html, callback, observers: []};

    if (try_body_started(waiting))
	return;

    /*
     * try_body_started() may have already registered an observer on an empty
     * <body>. Append to the list instead of replacing it, so that this earlier
     * observer also gets disconnected by finish_waiting() and no duplicate
     * <body> observer is created later.
     */
    waiting.observers.push(make_body_start_observer(detached_html, waiting));

    wait_loaded(doc).then(() => finish_waiting(waiting));
}

/*
 * Promise-returning wrapper around _wait_for_head(). The returned promise
 * resolves when _wait_for_head() invokes its callback.
 */
function wait_for_head(doc, detached_html)
{
    const executor = resolve => {
	_wait_for_head(doc, detached_html, resolve);
    };

    return new Promise(executor);
}

const blocked_str = "blocked";

/*
 * Disable attribute `attr' of `node' by moving its value to an attribute
 * named `blocked-<attr>' and removing the original.
 *
 * Disabling attributes by prepending `blocked-' allows them to still be
 * relatively easily accessed in case they contain some useful data. For the
 * same reason a value already stored under `blocked-<attr>' is not
 * overwritten; it is first moved to `blocked-blocked-<attr>' (and so on for
 * longer chains).
 *
 * When `ns' is a string, the *AttributeNS() family of methods is used with
 * `ns' as the namespace. Otherwise the plain *Attribute() methods are used.
 *
 * Nothing gets written if `node' has no attribute `attr'.
 */
function block_attribute(node, attr, ns=null)
{
    const use_ns = typeof ns === "string";
    const call = (method, ...args) => use_ns ?
	  node[`${method}AttributeNS`](ns, ...args) :
	  node[`${method}Attribute`](...args);

    /*
     * Collect the chain of names that are currently occupied: `attr',
     * `blocked-attr', `blocked-blocked-attr', ... The very same hyphenated
     * names are used for checking and for writing.
     */
    const occupied = [];
    for (let name = attr; call("has", name); name = `${blocked_str}-${name}`)
	occupied.push(name);

    /*
     * Shift every value one level further, starting from the most-prefixed
     * name so that nothing gets overwritten before it has been moved.
     */
    for (const name of occupied.reverse())
	call("set", `${blocked_str}-${name}`, call("get", name));

    call("remove", attr);
}

/*
 * The sanitizing functions below are used to disable `<script>'s and `<meta>'s
 * that have not yet been added to live DOM (this doesn't work for those
 * already added).
 */

/*
 * Block the `content' attribute of `meta' if its `http-equiv' matches
 * csp_header_regex and its `content' is non-empty.
 */
function sanitize_meta(meta)
{
    const is_csp_meta = csp_header_regex.test(meta.httpEquiv);

    if (!is_csp_meta || !meta.content)
	return;

    block_attribute(meta, "content");
}

/*
 * Remember the current value of script's `type' attribute (null if absent) in
 * its `haketilo_blocked_type' property and switch the type to "text/plain".
 */
function sanitize_script(script)
{
    const original_type = script.getAttribute("type");

    script.haketilo_blocked_type = original_type;
    script.type = "text/plain";
}

/*
 * To be executed after `<script>' has been connected to the DOM, when it is no
 * longer eligible for being executed by the browser. Restores the `type'
 * attribute remembered in `haketilo_blocked_type' (removing the attribute
 * altogether if there was none) and deletes that helper property.
 */
function desanitize_script(script)
{
    const original_type = script.haketilo_blocked_type;

    script.setAttribute("type", original_type);

    const had_no_type = original_type === null || original_type === undefined;
    if (had_no_type)
	script.removeAttribute("type");

    delete script.haketilo_blocked_type;
}

const bad_url_reg = /^data:([^,;]*ml|unknown-content-type)/i;

/*
 * Block those `href', `src' and `data' attributes of `element' whose value is
 * a data: URL matched by bad_url_reg. Nodes without an `attributes'
 * collection are silently skipped.
 */
function sanitize_urls(element)
{
    /* Take a snapshot, since blocking modifies the attribute collection. */
    const snapshot = Array.from(element.attributes || []);

    const offending = snapshot.filter(attr => {
	return /^(href|src|data)$/i.test(attr.localName) &&
	    bad_url_reg.test(attr.value);
    });

    offending.forEach(attr => {
	block_attribute(element, attr.localName, attr.namespaceURI);
    });
}

/*
 * Sanitize URL attributes of elements already present in `doc'. Unless the
 * document is already marked as loaded, also keep sanitizing nodes reported
 * as added by a MutationObserver until wait_loaded(doc) resolves.
 */
function start_data_urls_sanitizing(doc)
{
    const present = doc.querySelectorAll("*[href], *[src], *[data]");
    present.forEach(element => sanitize_urls(element));

    if (doc.content_loaded)
	return;

    const handle_mutations = mutations => {
	for (const mutation of mutations) {
	    for (const added_node of mutation.addedNodes)
		sanitize_urls(added_node);
	}
    };

    const url_observer = new MutationObserver(handle_mutations);
    url_observer.observe(doc, {childList: true, subtree: true});

    wait_loaded(doc).then(() => url_observer.disconnect());
}

/*
 * Normally, we block scripts with CSP. However, Mozilla does optimizations that
 * cause part of the DOM to be loaded when our content scripts get to run. Thus,
 * before the CSP rules we inject (for non-HTTP pages) become effective, we need
 * to somehow block the execution of `<script>'s and intrinsics that were
 * already there. Additionally, some browsers (IceCat 60) seem to have problems
 * applying this CSP to non-inline `<script>'s in certain scenarios.
 */

/*
 * Event listener that cancels the event unless its target carries a truthy
 * `haketilo_payload' property.
 */
function prevent_script_execution(event)
{
    if (event.target.haketilo_payload)
	return;

    event.preventDefault();
}

/*
 * Register prevent_script_execution() for `beforescriptexecute' events on
 * `doc' and, for every element of the document, reset to null each event
 * handler property (as seen through `wrappedJSObject') that corresponds to one
 * of element's on* attributes and currently holds a truthy value.
 */
function mozilla_initial_block(doc)
{
    doc.addEventListener("beforescriptexecute", prevent_script_execution);

    for (const elem of doc.querySelectorAll("*")) {
	const attr_names = Array.from(elem.attributes, attr => attr.localName);
	const active_handlers = attr_names.filter(
	    name => /^on/.test(name) && elem.wrappedJSObject[name]
	);

	for (const name of active_handlers)
	    elem.wrappedJSObject[name] = null;
    }
}

/*
 * Here we block all scripts of a document which might be either an
 * HTMLDocument or an XMLDocument. Modifying an XML document might disrupt
 * Mozilla's XML preview. This is an unfortunate thing we have to accept for
 * now. XML documents *have to* be sanitized as well because they might
 * contain `<script>' tags (or on* attributes) with namespace declared as
 * "http://www.w3.org/1999/xhtml" or "http://www.w3.org/2000/svg" which allows
 * javascript execution.
 *
 * `doc' is the document to sanitize. `policy' is passed to make_csp_rule() and
 * its `has_payload' property decides whether document's own CSP `<meta>' tags
 * get blocked. The returned promise resolves once the original root element
 * is back in the document and sanitizing of data: URLs has been started.
 */
async function sanitize_document(doc, policy)
{
    /*
     * Blocking of scripts that are in the DOM from the beginning. Needed for
     * Mozilla.
     */
    if (is_mozilla)
	mozilla_initial_block(doc);

    /*
     * Ensure our CSP rules are employed from the beginning. This CSP injection
     * method is, when possible, going to be applied together with CSP rules
     * injected using webRequest.
     * Using elements namespaced as HTML makes this CSP injection also work for
     * non-HTML documents.
     */
    const html = new DOMParser().parseFromString(`<html><head><meta \
http-equiv="Content-Security-Policy" content="${make_csp_rule(policy)}"\
/></head><body>Loading...</body></html>`, "text/html").documentElement;

    /*
     * Root node gets hijacked now, to be re-attached after <head> is loaded
     * and sanitized. In the meantime the placeholder `html' element carrying
     * our CSP `<meta>' takes its place in the document.
     */
    const root = doc.documentElement;
    root.replaceWith(html);

    /*
     * When we don't inject payload, we neither block document's CSP `<meta>'
     * tags nor wait for `<head>' to be parsed.
     */
    if (policy.has_payload) {
	await wait_for_head(doc, root);

	/* The extra `policy' argument is not used by sanitize_meta(). */
	root.querySelectorAll("head meta")
	    .forEach(m => sanitize_meta(m, policy));
    }

    /*
     * The order of the 3 statements below matters: scripts get their type
     * switched to "text/plain" while root is still detached, then root is put
     * back in place of the placeholder and only afterwards are the original
     * types restored.
     */
    root.querySelectorAll("script").forEach(s => sanitize_script(s, policy));
    html.replaceWith(root);
    root.querySelectorAll("script").forEach(s => desanitize_script(s, policy));

    start_data_urls_sanitizing(doc);
}

/*
 * Unregister all Service Workers found through navigator.serviceWorker and
 * reload the page. Resolves without any action when the API is missing or no
 * registrations exist. When a reload has been requested after successful
 * unregistering, the returned promise never settles.
 */
async function _disable_service_workers()
{
    const sw_container = navigator.serviceWorker;
    if (!sw_container)
	return;

    const registrations = await sw_container.getRegistrations();
    if (registrations.length === 0)
	return;

    console.warn("Service Workers detected on this page! Unregistering and reloading.");

    try {
	const unregistering =
	      registrations.map(registration => registration.unregister());
	await Promise.all(unregistering);
    } finally {
	/* Reload even if unregistering failed. */
	location.reload();
    }

    /* Never actually return! */
    return new Promise(() => 0);
}

/*
 * Trying to use service workers APIs might result in exceptions, for example
 * when in a non-HTML document. Because of this, the function that does the
 * actual work is called inside a try {} block and any exception is merely
 * logged at debug level.
 */
async function disable_service_workers()
{
    const report = error => {
	console.debug("Exception thrown during an attempt to detect and disable service workers.", error);
    };

    try {
	await _disable_service_workers();
    } catch (error) {
	report(error);
    }
}

/*
 * Synchronously obtain the policy for `url'. A blocking XMLHttpRequest is made
 * to extension's "dummy" URL with `url' in the query string. The policy is
 * then read as URI-encoded JSON from the `settings' query parameter of the
 * final response URL (presumably the request gets redirected by extension's
 * background code - verify there).
 *
 * Returns the decoded policy object. On any failure - be it the request
 * itself or a response URL that lacks a well-formed `settings' value - the
 * error is logged and the restrictive fallback `{allow: false}' is returned,
 * so that a malformed response cannot abort the content script and leave the
 * page unblocked.
 */
function synchronously_get_policy(url)
{
    const encoded_url = encodeURIComponent(url);
    const request_url = `${browser.runtime.getURL("dummy")}?url=${encoded_url}`;

    let response_url;

    try {
	const xhttp = new XMLHttpRequest();
	xhttp.open("GET", request_url, false);
	xhttp.send();
	response_url = xhttp.responseURL;
    } catch(e) {
	console.error("Failure to synchronously fetch policy for url.", e);
	return {allow: false};
    }

    try {
	const match = /^[^?]*\?settings=(.*)$/.exec(response_url);
	if (match === null)
	    throw new Error("No settings found in response URL.");

	const policy = JSON.parse(decodeURIComponent(match[1]));
	/* Callers access policy's properties; anything else is unusable. */
	if (policy === null || typeof policy !== "object")
	    throw new TypeError("Policy is not an object.");

	return policy;
    } catch(e) {
	console.error("Failure to parse synchronously fetched policy for url.", e);
	return {allow: false};
    }
}

/*
 * Entry point. On non-privileged pages fetch the policy, report it, and -
 * unless the policy allows scripts - start sanitizing the document and
 * disabling Service Workers. Page actions are given a promise that resolves
 * when all of that is done and the document has loaded.
 */
if (!is_privileged_url(document.URL)) {
    const policy = synchronously_get_policy(document.URL);

    /* Payload is dropped for documents that are not HTMLDocument's. */
    const is_html_document = document instanceof HTMLDocument;
    if (!is_html_document)
	delete policy.payload;

    console.debug("current policy", policy);

    report_settings(policy);

    policy.nonce = gen_nonce();

    /* Run `task' only when scripts are not allowed by the policy. */
    const unless_allowed = task =>
	  policy.allow ? Promise.resolve() : task();

    const doc_ready = Promise.all([
	unless_allowed(() => sanitize_document(document, policy)),
	unless_allowed(() => disable_service_workers()),
	wait_loaded(document)
    ]);

    handle_page_actions(policy, doc_ready);

    start_activity_info_server();
}