3 files changed, 104 insertions, 103 deletions
diff --git a/content/activity_info_server.js b/content/activity_info_server.js
index beecb1a..1b69703 100644
--- a/content/activity_info_server.js
+++ b/content/activity_info_server.js
@@ -44,9 +44,9 @@ function report_settings(settings)
     report_activity("settings", settings);
 }
 
-function report_content_type(content_type)
+function report_document_type(is_html)
 {
-    report_activity("content_type", content_type);
+    report_activity("is_html", is_html);
 }
 
 function report_repo_query_action(update, port)
@@ -96,6 +96,6 @@ function start_activity_info_server()
  * EXPORT start_activity_info_server
  * EXPORT report_script
  * EXPORT report_settings
- * EXPORT report_content_type
+ * EXPORT report_document_type
  * EXPORTS_END
  */
diff --git a/content/main.js b/content/main.js
index a183913..fb334dd 100644
--- a/content/main.js
+++ b/content/main.js
@@ -22,6 +22,12 @@
  * IMPORTS_END
  */
 
+document.content_loaded = document.readyState === "complete";
+const wait_loaded = e => e.content_loaded ? Promise.resolve() :
+      new Promise(c => e.addEventListener("DOMContentLoaded", c, {once: true}));
+
+wait_loaded(document).then(() => document.content_loaded = true);
+
 function extract_cookie_policy(cookie, min_time)
 {
     let best_result = {time: -1};
@@ -86,18 +92,17 @@ function employ_nonhttp_policy(policy)
 }
 
 /*
+ * In the case of HTML documents:
  * 1. When injecting some payload we need to sanitize <meta> CSP tags before
  *    they reach the document.
  * 2. Only <meta> tags inside <head> are considered valid by the browser and
  *    need to be considered.
  * 3. We want to detach <html> from document, wait until its <head> completes
  *    loading, sanitize it and re-attach <html>.
- * 4. Browsers are eager to add <meta>'s that appear after `</head>' but before
- *    `<body>'. Due to this behavior the `DOMContentLoaded' event is considered
- *    unreliable (although it could still work properly, it is just problematic
- *    to verify).
- * 5. We shall wait for anything to appear in or after <body> and take that as
- *    a sign <head> has _really_ finished loading.
+ * 4. We shall wait for anything to appear in or after <body> and take that as
+ *    a sign <head> has finished loading.
+ * 5. Otherwise, getting the `DOMContentLoaded' event on the document shall also
+ *    be a sign that <head> is fully loaded.
  */
 
 function make_body_start_observer(DOM_element, waiting)
@@ -123,8 +128,10 @@ function try_body_started(waiting)
 
 function finish_waiting(waiting)
 {
+    if (waiting.finished)
+	return;
+    waiting.finished = true;
     waiting.observers.forEach(observer => observer.disconnect());
-    waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb);
     setTimeout(waiting.callback, 0);
 }
 
@@ -132,19 +139,12 @@ function _wait_for_head(doc, detached_html, callback)
 {
     const waiting = {doc, detached_html, callback, observers: []};
 
-    /*
-     * For XML and SVG documents, instead of waiting for `<head>', we wait
-     * for the entire document to finish loading.
-     */
-    if (doc instanceof HTMLDocument) {
-	if (try_body_started(waiting))
-	    return;
+    if (try_body_started(waiting))
+	return;
 
-	waiting.observers = [make_body_start_observer(detached_html, waiting)];
-    }
+    waiting.observers = [make_body_start_observer(detached_html, waiting)];
 
-    waiting.loaded_cb = () => finish_waiting(waiting);
-    doc.addEventListener("DOMContentLoaded", waiting.loaded_cb);
+    wait_loaded(doc).then(() => finish_waiting(waiting));
 }
 
 function wait_for_head(doc, detached_html)
@@ -154,42 +154,43 @@ function wait_for_head(doc, detached_html)
 
 const blocked_str = "blocked";
 
-function block_attribute(node, attr)
+function block_attribute(node, attr, ns=null)
 {
+    const [hasa, geta, seta, rema] = ["has", "get", "set", "remove"]
+	  .map(m => (n, ...args) => typeof ns === "string" ?
+	       n[`${m}AttributeNS`](ns, ...args) : n[`${m}Attribute`](...args));
     /*
-     * Disabling attributes this way allows them to still be relatively
-     * easily accessed in case they contain some useful data.
+     * Disabling attributes by prepending `blocked-' allows them to still be
+     * relatively easily accessed in case they contain some useful data.
      */
     const construct_name = [attr];
-    while (node.hasAttribute(construct_name.join("")))
+    while (hasa(node, construct_name.join("")))
 	construct_name.unshift(blocked_str);
 
     while (construct_name.length > 1) {
 	construct_name.shift();
 	const name = construct_name.join("");
-	node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name));
+	seta(node, `${blocked_str}-${name}`, geta(node, name));
     }
-
-    node.removeAttribute(attr);
 }
 
 function sanitize_meta(meta, policy)
 {
-    const http_equiv = meta.getAttribute("http-equiv");
-    const value = meta.content;
+    const value = meta.content || "";
 
-    if (!value || !is_csp_header_name(http_equiv, true))
+    if (!value || !is_csp_header_name(meta.httpEquiv || "", true))
 	return;
 
     block_attribute(meta, "content");
-
-    if (is_csp_header_name(http_equiv, false))
-	meta.content = sanitize_csp_header({value}, policy).value;
 }
 
+/*
+ * Used to disable <script> that has not yet been added to live DOM (doesn't
+ * work for those already added).
+ */
 function sanitize_script(script)
 {
-    script.hachette_blocked_type = script.type;
+    script.hachette_blocked_type = script.getAttribute("type");
     script.type = "text/plain";
 }
 
@@ -201,102 +202,101 @@ function desanitize_script(script, policy)
 {
     script.setAttribute("type", script.hachette_blocked_type);
 
-    if (script.hachette_blocked_type === undefined)
+    if (script.hachette_blocked_type === null)
 	script.removeAttribute("type");
 
     delete script.hachette_blocked_type;
 }
 
-function apply_hachette_csp_rules(doc, head, policy)
-{
-    const meta = doc.createElement("meta");
-    meta.setAttribute("http-equiv", "Content-Security-Policy");
-    meta.setAttribute("content", csp_rule(policy.nonce));
-    head.append(meta);
-    /* CSP is already in effect, we can remove the <meta> now. */
-    meta.remove();
-}
-
+const bad_url_reg = /^data:([^,;]*ml|unknown-content-type)/i;
 function sanitize_urls(element)
 {
-    for (const attribute of [...element.attributes]) {
-	if (/^(href|src|data)$/i.test(attribute.localName) &&
-	    /^data:([^,;]*ml|unknown-content-type)/i.test(attribute.value))
-	    block_attribute(element, attribute.localName);
-    }
+    for (const attr of [...element.attributes || []]
+	       .filter(attr => /^(href|src|data)$/i.test(attr.localName))
+	       .filter(attr => bad_url_reg.test(attr.value)))
+	block_attribute(element, attr.localName, attr.namespaceURI);
 }
 
 function start_data_urls_sanitizing(doc)
 {
     doc.querySelectorAll("*[href], *[src], *[data]").forEach(sanitize_urls);
-    const mutation_handler = m => m.addedNodes.forEach(sanitize_urls);
-    const mo = new MutationObserver(ms => ms.forEach(mutation_handler));
-    mo.observe(doc, {childList: true, subtree: true});
+    if (!doc.content_loaded) {
+	const mutation_handler = m => m.addedNodes.forEach(sanitize_urls);
+	const mo = new MutationObserver(ms => ms.forEach(mutation_handler));
+	mo.observe(doc, {childList: true, subtree: true});
+	wait_loaded(doc).then(() => mo.disconnect());
+    }
 }
 
-function apply_intrinsics_sanitizing(root_element)
+/*
+ * Normally, we block scripts with CSP. However, Mozilla does optimizations that
+ * cause part of the DOM to be loaded when our content scripts get to run. Thus,
+ * before the CSP rules we inject (for non-HTTP pages) become effective, we need
+ * to somehow block the execution of `<script>'s and intrinsics that were
+ * already there.
+ */
+function mozilla_initial_block(doc)
 {
-    for (const subelem of root_element.querySelectorAll("*")) {
-	[...subelem.attributes]
-	    .filter(a => /^on/i.test(a.localName))
-	    .filter(a => /^javascript:/i.test(a.value))
-	    .forEach(a => block_attribute(subelem, a.localName));
-    }
+    const blocker = e => e.preventDefault();
+    doc.addEventListener("beforescriptexecute", blocker);
+    setTimeout(() => doc.removeEventListener("beforescriptexecute", blocker));
+
+    [...doc.all].flatMap(ele => [...ele.attributes].map(attr => [ele, attr]))
+	.map(([ele, attr]) => [ele, attr.localName])
+	.filter(([ele, attr]) => /^on/.test(attr) && ele.wrappedJSObject[attr])
+	.forEach(([ele, attr]) => ele.wrappedJSObject[attr] = null);
 }
 
+/*
+ * Here we block all scripts of a document which might be either an
+ * HTMLDocument or an XMLDocument. Modifying an XML document might disrupt
+ * Mozilla's XML preview. This is an unfortunate thing we have to accept for
+ * now. XML documents *have to* be sanitized as well because they might
+ * contain `<script>' tags (or on* attributes) with namespace declared as
+ * "http://www.w3.org/1999/xhtml" or "http://www.w3.org/2000/svg" which allows
+ * javascript execution.
+ */
 async function sanitize_document(doc, policy)
 {
     /*
      * Blocking of scripts that are in the DOM from the beginning. Needed for
-     * Mozilla, harmless on Chromium.
-     * Note that at least in SVG documents the `src' attr on `<script>'s seems
-     * to be ignored by Firefox, so we don't need to sanitize it.
+     * Mozilla.
      */
-    for (const script of document.getElementsByTagName("script")) {
-	const old_children = [...script.childNodes];
-	script.innerHTML = "";
-	setTimeout(() => old_children.forEach(c => script.append(c)), 0);
-    }
+    if (is_mozilla)
+	mozilla_initial_block(doc);
 
     /*
      * Ensure our CSP rules are employed from the beginning. This CSP injection
      * method is, when possible, going to be applied together with CSP rules
      * injected using webRequest.
-     * For non-HTML documents this is just a dummy operation of adding and
-     * removing `head'.
+     * Using elements namespaced as HTML makes this CSP injection also work for
+     * non-HTML documents.
      */
-    let added_head = doc.createElement("head");
-    if (!doc.head)
-	doc.documentElement.prepend(added_head);
-
-    apply_hachette_csp_rules(doc, added_head, policy);
-
-    /* Proceed with DOM in its initial state. */
-    added_head.remove();
+    const html = new DOMParser().parseFromString(`<html><head><meta \
+http-equiv="Content-Security-Policy" content="${csp_rule(policy.nonce)}"\
+/></head><body>Loading...</body></html>`, "text/html").documentElement;
 
     /*
-     * <html> node gets hijacked now, to be re-attached after <head> is loaded
+     * Root node gets hijacked now, to be re-attached after <head> is loaded
      * and sanitized.
      */
-    const old_html = doc.documentElement;
-    const new_html = doc.createElement("html");
-    old_html.replaceWith(new_html);
+    const root = doc.documentElement;
+    root.replaceWith(html);
 
-    await wait_for_head(doc, old_html);
-
-    for (const meta of old_html.querySelectorAll("head meta"))
-	sanitize_meta(meta, policy);
-
-    for (const script of old_html.querySelectorAll("script"))
-	sanitize_script(script, policy);
-
-    if (!(doc instanceof HTMLDocument))
-	apply_intrinsics_sanitizing(old_html);
+    /*
+     * For XML documents, we don't intend to inject payload, so we neither block
+     * document's CSP `<meta>' tags nor wait for `<head>' to be parsed.
+     */
+    if (doc instanceof HTMLDocument) {
+	await wait_for_head(doc, root);
 
-    new_html.replaceWith(old_html);
+	root.querySelectorAll("head meta")
+	    .forEach(m => sanitize_meta(m, policy));
+    }
 
-    for (const script of old_html.querySelectorAll("script"))
-	desanitize_script(script, policy);
+    root.querySelectorAll("script").forEach(s => sanitize_script(s, policy));
+    html.replaceWith(root);
+    root.querySelectorAll("script").forEach(s => desanitize_script(s, policy));
 
     start_data_urls_sanitizing(doc);
 }
@@ -329,14 +329,15 @@ if (!is_privileged_url(document.URL)) {
     }
 
     if (!policy) {
-	console.warn("Using fallback policy!");
+	console.debug("Using fallback policy!");
 	policy = {allow: false, nonce: gen_nonce()};
     }
 
+    console.debug("current policy", policy);
+
     const doc_ready = Promise.all([
-	policy.allow ? Promise.resolve : sanitize_document(document, policy),
-	new Promise(cb => document.addEventListener("DOMContentLoaded",
-						    cb, {once: true}))
+	policy.allow ? Promise.resolve() : sanitize_document(document, policy),
+	wait_loaded(document)
     ]);
 
     handle_page_actions(policy.nonce, policy_received_callback, doc_ready);
diff --git a/content/page_actions.js b/content/page_actions.js
index 8057541..040b4ab 100644
--- a/content/page_actions.js
+++ b/content/page_actions.js
@@ -11,7 +11,7 @@
  * IMPORT browser
  * IMPORT report_script
  * IMPORT report_settings
- * IMPORT report_content_type
+ * IMPORT report_document_type
  * IMPORTS_END
  */
 
@@ -70,8 +70,8 @@ function handle_page_actions(script_nonce, policy_received_cb,
 			     doc_ready_promise) {
     policy_received_callback = policy_received_cb;
     url = document.URL;
-    is_html = /html/.test(document.contentType);
-    report_content_type(document.contentType);
+    is_html = document instanceof HTMLDocument;
+    report_document_type(is_html);
 
     doc_ready_promise.then(document_ready);