2 files changed, 143 insertions, 356 deletions
diff --git a/content/main.js b/content/main.js
index 4ae7738..8440eb5 100644
--- a/content/main.js
+++ b/content/main.js
@@ -16,7 +16,9 @@
  * IMPORT is_chrome
  * IMPORT is_mozilla
  * IMPORT start_activity_info_server
- * IMPORT modify_on_the_fly
+ * IMPORT csp_rule
+ * IMPORT is_csp_header_name
+ * IMPORT sanitize_csp_header
  * IMPORTS_END
  */
 
@@ -31,6 +33,143 @@ function accept_node(node, parent)
     parent.hachette_corresponding.appendChild(clone);
 }
 
+/*
+ * 1. When injecting some payload we need to sanitize <meta> CSP tags before
+ *    they reach the document.
+ * 2. Only <meta> tags inside <head> are considered valid by the browser and
+ *    need to be considered.
+ * 3. We want to detach <html> from document, wait until its <head> completes
+ *    loading, sanitize it and re-attach <html>.
+ * 4. Browsers are eager to add to <head> those <meta>'s that appear after
+ *    `</head>' but before `<body>'. Due to this behavior the
+ *    `DOMContentLoaded' event is considered unreliable (although it could
+ *    still work properly, it is just problematic to verify).
+ * 5. We shall wait for anything to appear in or after <body> and take that as
+ *    a sign <head> has _really_ finished loading.
+ */
+
+function make_body_start_observer(DOM_element, waiting)
+{
+    const observer = new MutationObserver(() => try_body_started(waiting));
+    observer.observe(DOM_element, {childList: true}); /* direct children */
+    return observer;
+}
+
+function try_body_started(waiting)
+{
+    const body = waiting.detached_html.querySelector("body");
+    /* Content in/after <body>, or after <html>, means <head> is complete. */
+    if ((body && (body.firstChild || body.nextSibling)) ||
+	waiting.doc.documentElement.nextSibling) {
+	finish_waiting(waiting);
+	return true;
+    }
+    /* Empty <body>: watch it too unless 2 observers are already listed. */
+    if (body && waiting.observers.length < 2)
+	waiting.observers.push(make_body_start_observer(body, waiting));
+}
+
+function finish_waiting(waiting)
+{
+    waiting.observers.forEach(observer => observer.disconnect());
+    waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb);
+    setTimeout(waiting.callback, 0); /* deferred, never run synchronously */
+}
+
+function _wait_for_head(doc, detached_html, callback)
+{
+    const waiting = {doc, detached_html, callback, observers: []};
+    if (try_body_started(waiting))
+	return;
+    /* Append: try_body_started() may have stored a <body> observer already. */
+    waiting.observers.push(make_body_start_observer(detached_html, waiting));
+    waiting.loaded_cb = () => finish_waiting(waiting);
+    doc.addEventListener("DOMContentLoaded", waiting.loaded_cb);
+}
+
+function wait_for_head(doc, detached_html)
+{
+    return new Promise(resolve => _wait_for_head(doc, detached_html, resolve));
+}
+
+const blocked_str = "blocked";
+
+function block_attribute(node, attr)
+{
+    /*
+     * Rename `attr' to `blocked-attr' so its value stays easily accessible.
+     * Existing `blocked-...' names are shifted one prefix deeper, not lost.
+     */
+    const construct_name = [attr];
+    while (node.hasAttribute(construct_name.join("-")))
+	construct_name.unshift(blocked_str);
+
+    while (construct_name.length > 1) {
+	construct_name.shift();
+	const name = construct_name.join("-");
+	node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name));
+    }
+
+    node.removeAttribute(attr);
+}
+
+function sanitize_meta(meta, policy)
+{
+    const http_equiv = meta.getAttribute("http-equiv");
+    const value = meta.content;
+    /* Nothing to do for <meta>'s without content or with non-CSP names. */
+    if (!value || !is_csp_header_name(http_equiv, true))
+	return;
+    /* Disable the original rule; its value is kept in `blocked-content'. */
+    block_attribute(meta, "content");
+    /* Re-add a sanitized rule only for names that also match with `false'. */
+    if (is_csp_header_name(http_equiv, false))
+	meta.content = sanitize_csp_header({value}, policy).value;
+}
+
+function apply_hachette_csp_rules(doc, policy)
+{
+    const csp_meta = doc.createElement("meta");
+    csp_meta.setAttribute("http-equiv", "Content-Security-Policy");
+    csp_meta.setAttribute("content", csp_rule(policy.nonce));
+    doc.head.appendChild(csp_meta);
+    /* The policy keeps being enforced, so the <meta> need not stay in DOM. */
+    csp_meta.remove();
+}
+
+async function sanitize_document(doc, policy)
+{
+    /*
+     * Ensure our CSP rules are employed from the beginning. This CSP injection
+     * method is, when possible, going to be applied together with CSP rules
+     * injected using webRequest.
+     */
+    const has_own_head = doc.head;
+    if (!has_own_head)
+	doc.documentElement.prepend(doc.createElement("head"));
+    /* Temporary <head> above exists only so the <meta> has a parent. */
+    apply_hachette_csp_rules(doc, policy);
+
+    /* Probably not needed: remove temporary <head>, restoring initial DOM. */
+    if (!has_own_head)
+	doc.head.remove();
+
+    /*
+     * <html> node gets hijacked now, to be re-attached after <head> is loaded
+     * and sanitized.
+     */
+    const old_html = doc.documentElement;
+    const new_html = doc.createElement("html");
+    old_html.replaceWith(new_html);
+    /* Placeholder <html> stays in doc while the real one fills up detached. */
+    await wait_for_head(doc, old_html);
+    /* <head> is complete: sanitize page-supplied CSP <meta>'s inside it. */
+    for (const meta of old_html.querySelectorAll("head meta"))
+	sanitize_meta(meta, policy);
+    /* Swap the sanitized original <html> back in place of the placeholder. */
+    new_html.replaceWith(old_html);
+}
+
 if (!is_privileged_url(document.URL)) {
     const reductor =
 	  (ac, [_, sig, pol]) => ac[0] && ac || [extract_signed(sig, pol), sig];
@@ -45,18 +184,10 @@ if (!is_privileged_url(document.URL)) {
     if (signature)
 	document.cookie = `hachette-${signature}=; Max-Age=-1;`;
 
-    handle_page_actions(policy.nonce);
+    if (!policy.allow)
+	sanitize_document(document, policy);
 
-    if (!policy.allow) {
-	const old_html = document.documentElement;
-	const new_html = document.createElement("html");
-	old_html.replaceWith(new_html);
-	old_html.hachette_corresponding = new_html;
-
-	const modify_end =
-	      modify_on_the_fly(old_html, policy, {node_eater: accept_node});
-	document.addEventListener("DOMContentLoaded", modify_end);
-    }
+    handle_page_actions(policy.nonce);
 
     start_activity_info_server();
 }
diff --git a/content/sanitize_document.js b/content/sanitize_document.js
deleted file mode 100644
index 727bb6c..0000000
--- a/content/sanitize_document.js
+++ /dev/null
@@ -1,344 +0,0 @@
-/**
- * Hachette modify HTML document as it loads and reconstruct HTML code from it
- *
- * Copyright (C) 2021 Wojtek Kosior
- * Redistribution terms are gathered in the `copyright' file.
- */
-
-/*
- * IMPORTS_START
- * IMPORT gen_nonce
- * IMPORT csp_rule
- * IMPORT is_csp_header_name
- * IMPORT sanitize_csp_header
- * IMPORT sanitize_attributes
- * IMPORTS_END
- */
-
-/*
- * Functions that sanitize elements. The script blocking measures are, when
- * possible, going to be applied together with CSP rules injected using
- * webRequest.
- */
-
-const blocked = "blocked";
-
-function block_attribute(node, attr)
-{
-    /*
-     * Disabling attributed this way allows them to still be relatively
-     * easily accessed in case they contain some useful data.
-     */
-
-    const construct_name = [attr];
-    while (node.hasAttribute(construct_name.join("")))
-	construct_name.unshift(blocked);
-
-    while (construct_name.length > 1) {
-	construct_name.shift();
-	const name = construct_name.join("");
-	node.setAttribute(`${blocked}-${name}`, node.getAttribute(name));
-    }
-
-    node.removeAttribute(attr);
-}
-
-function sanitize_script(script, data)
-{
-    if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
-	script.remove();
-	script.hachette_deleted = true;
-	script.hachette_ignore = true;
-    }
-
-    if (data.policy.allow)
-	return;
-
-    block_attribute(script, "type");
-    script.setAttribute("type", "application/json");
-}
-
-function inject_csp(head, data)
-{
-    if (data.policy.allow)
-	return;
-
-    const meta = document.createElement("meta");
-    meta.setAttribute("http-equiv", "Content-Security-Policy");
-    meta.setAttribute("content", csp_rule(data.policy.nonce));
-    meta.hachette_ignore = true;
-    head.prepend(meta);
-
-    data.new_added.unshift([meta, head]);
-}
-
-function sanitize_http_equiv_csp_rule(meta, data)
-{
-    const http_equiv = meta.getAttribute("http-equiv");
-    const value = meta.content;
-
-    if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
-	return;
-
-    block_attribute(meta, "content");
-
-    if (data.policy.allow || is_csp_header_name(http_equiv, false))
-	meta.content = sanitize_csp_header({value}, data.policy).value;
-}
-
-function sanitize_node(node, data)
-{
-    if (node.tagName === "SCRIPT")
-	sanitize_script(node, data);
-
-    if (node.tagName === "HEAD")
-	inject_csp(node, data);
-
-    if (node.tagName === "META")
-	sanitize_http_equiv_csp_rule(node, data);
-
-    if (!data.policy.allow)
-	sanitize_attributes(node, data);
-}
-
-/*
- * Instead of calling writer directly with multiple small chunks of reconstruced
- * HTML code, we utilize `setTimeout()' to only have it called once,
- * asynchronously.
- */
-function do_write_callback(data)
-{
-    data.writer(data.chunks.join(""));
-    data.chunks = [];
-
-    if (data.finished && data.finisher)
-	data.finisher();
-}
-
-function do_write(chunk, data)
-{
-    data.chunks.push(chunk);
-    clearTimeout(data.write_timeout);
-    data.write_timeout = setTimeout(() => do_write_callback(data), 0);
-}
-
-const serializer = new XMLSerializer();
-
-function start_serializing_node(node, data)
-{
-    node.hachette_started = true;
-
-    if (!data.writer)
-	return;
-
-    const clone = node.cloneNode(false);
-    clone.textContent = data.uniq;
-    do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
-}
-
-function finish_serializing_node(node, data)
-{
-    const nodes_to_process = [node];
-
-    while (true) {
-	node = nodes_to_process.pop();
-	if (!node)
-	    break;
-
-	nodes_to_process.push(node, node.hachette_last_added);
-    }
-
-    while (nodes_to_process.length > 0) {
-	const node = nodes_to_process.pop();
-	node.remove();
-	node.hachette_ignore = true;
-
-	if (!data.writer)
-	    continue;
-
-	if (node.hachette_started) {
-	    node.textContent = data.uniq;
-	    do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
-	    continue;
-	}
-
-	do_write(node.outerHTML || serializer.serializeToString(node), data);
-    }
-}
-
-function process_initial_nodes(node, data)
-{
-    if (data.processed_initial_nodes)
-	return;
-
-    data.processed_initial_nodes = true;
-
-    start_serializing_node(data.html_root, data);
-
-    const new_added = [];
-    const nodes_to_process = [data.html_root];
-
-    let i = 0;
-    while (nodes_to_process.length > 0) {
-	let current = nodes_to_process.shift();
-
-	if (current.firstChild) {
-	    if (current.firstChild === node)
-		break;
-	    nodes_to_process.unshift(current.firstChild, current);
-	    new_added.push([current.firstChild, current]);
-	    continue;
-	}
-
-	while (current && !current.nextSibling)
-	    current = nodes_to_process.shift();
-
-	if (!current || current.nextSibling === node)
-	    break;
-
-	nodes_to_process.unshift(current.nextSibling);
-	new_added.push([current.nextSibling, nodes_to_process[1]]);
-    }
-
-    data.new_added.unshift(...new_added);
-}
-
-/*
- * Important! Due to some weirdness node.parentElement is not alway correct
- * in MutationRecords under Chromium. Track node relations manually.
- */
-function handle_added_node(node, true_parent, data)
-{
-    /*
-     * Functions we call here might cause new nodes to be injected or found
-     * that require processing before the one we got in function argument.
-     * We rely on those functions putting the node(s) they create/find at the
-     * very beginning of the `new_added' queue and (for created nodes) setting
-     * their `hachette_ignore' property, based on which their MutationRecord
-     * will not be processed. A function can also mark a node already in the
-     * `new_added' queue as not eligible for processing by setting its
-     * `hachette_deleted' property.
-     */
-
-    process_initial_nodes(node, data);
-
-    data.new_added.push([node, true_parent]);
-
-    while (data.new_added.length > 0) {
-	[node, true_parent] = data.new_added.shift();
-
-	if (true_parent.hachette_deleted)
-	    node.hachette_deleted = true;
-	if (node.hachette_deleted)
-	    continue;
-
-	if (!true_parent.hachette_started)
-	    start_serializing_node(true_parent, data)
-
-	if (!node.hachette_ignore)
-	    sanitize_node(node, data);
-
-	if (node.hachette_deleted)
-	    continue;
-
-	if (data.node_eater)
-	    data.node_eater(node, true_parent);
-
-	finish_serializing_node(true_parent.hachette_last_added, data);
-
-	true_parent.hachette_last_added = node;
-    }
-}
-
-function handle_mutation(mutations, data)
-{
-    /*
-     * Chromium: for an unknown reason mutation.target is not always the same as
-     * node.parentElement. The former is the correct one.
-     */
-    for (const mutation of mutations) {
-	for (const node of mutation.addedNodes) {
-	    /* Check for nodes added by ourselves. */
-	    if (mutation.target.hachette_ignore)
-		node.hachette_ignore = true;
-	    if (node.hachette_ignore)
-		continue;
-
-	    handle_added_node(node, mutation.target, data);
-	}
-    }
-}
-
-function finish_processing(data)
-{
-    process_initial_nodes(undefined, data);
-
-    /*
-     * The `finisher' callback should be called, if provided. Normally our
-     * function that performs the last write does it after seeing `finished'
-     * set to `true'. If, however, there's no `writer' callback and hence no
-     * writes to perform, we need to take care of calling `finisher' here.
-     */
-    data.finished = true;
-    handle_mutation(data.observer.takeRecords(), data);
-    data.observer.disconnect();
-
-    /*
-     * Additional whitespace that was after `</body>' gets appended to body.
-     * Although it's a minor issue, it is not what we want. There's no way to
-     * tell exactly what part of that whitespace was after `</body>' and what
-     * was before, so we just replace it with a single newline which looks good
-     * when printed.
-     */
-    const body = data.html_root.lastChild;
-    const text = body && body.tagName === "BODY" && body.lastChild;
-    if (text && text.nodeName === "#text") {
-	const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
-	text.textContent = new_content + "\n";
-    }
-
-    finish_serializing_node(data.html_root, data);
-    if (!data.writer && data.finisher)
-	setTimeout(data.finisher, 0);
-}
-
-/*
- * This function sanitizes `html_root' according to `policy'. It is capable of
- * working on an HTML document that is being written to, sanitizing new nodes
- * as they appear.
- *
- * `consumers' object may contain 3 optional callback functions: `writer',
- * `node_eater' and `finisher'. The first one, if present, is called with chunks
- * of reconstructed HTML code. The second one, if present, gets called for every
- * added node with 2 arguments: that node and its parent. The third one is
- * called at the end, after all processing has been done.
- *
- * `modify_on_the_fly()' returns a callback that should be called (with no
- * arguments) once the document of html_root has finished being written to.
- * Unfortunately, due to specifics behavior of document that has had its
- * documentElement replaced
- */
-function modify_on_the_fly(html_root, policy, consumers)
-{
-    const uniq = gen_nonce();
-    const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
-    const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
-    Object.assign(data, consumers);
-
-    var observer = new MutationObserver(m => handle_mutation(m, data));
-    observer.observe(data.html_root, {
-     	attributes: true,
-	childList: true,
-	subtree: true
-    });
-
-    data.observer = observer;
-
-    return () => finish_processing(data);
-}
-
-/*
- * EXPORTS_START
- * EXPORT modify_on_the_fly
- * EXPORTS_END
- */