1 files changed, 182 insertions, 47 deletions
diff --git a/content/sanitize_document.js b/content/sanitize_document.js
index 1533526..727bb6c 100644
--- a/content/sanitize_document.js
+++ b/content/sanitize_document.js
@@ -43,76 +43,100 @@ function block_attribute(node, attr)
     node.removeAttribute(attr);
 }
 
-function sanitize_script(script, policy)
+function sanitize_script(script, data)
 {
-    if (policy.allow)
+    if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
+	script.remove();
+	script.hachette_deleted = true;
+	script.hachette_ignore = true;
+    }
+
+    if (data.policy.allow)
 	return;
 
     block_attribute(script, "type");
     script.setAttribute("type", "application/json");
 }
 
-function inject_csp(head, policy)
+function inject_csp(head, data)
 {
-    if (policy.allow)
+    if (data.policy.allow)
 	return;
 
     const meta = document.createElement("meta");
     meta.setAttribute("http-equiv", "Content-Security-Policy");
-    meta.setAttribute("content", csp_rule(policy.nonce));
+    meta.setAttribute("content", csp_rule(data.policy.nonce));
     meta.hachette_ignore = true;
     head.prepend(meta);
+
+    data.new_added.unshift([meta, head]);
 }
 
-function sanitize_http_equiv_csp_rule(meta, policy)
+function sanitize_http_equiv_csp_rule(meta, data)
 {
     const http_equiv = meta.getAttribute("http-equiv");
+    const value = meta.content;
 
-    if (!is_csp_header_name(http_equiv, !policy.allow))
+    if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
 	return;
 
-    if (policy.allow || is_csp_header_name(http_equiv, false)) {
-	let value = meta.getAttribute("content");
-	block_attribute(meta, "content");
-	if (value) {
-	    value = sanitize_csp_header({value}, policy).value;
-	    meta.setAttribute("content", value);
-	}
-	return;
-    }
+    block_attribute(meta, "content");
 
-    block_attribute(meta, "http-equiv");
+    if (data.policy.allow || is_csp_header_name(http_equiv, false))
+	meta.content = sanitize_csp_header({value}, data.policy).value;
 }
 
-function sanitize_node(node, policy)
+function sanitize_node(node, data)
 {
     if (node.tagName === "SCRIPT")
-	sanitize_script(node, policy);
+	sanitize_script(node, data);
 
     if (node.tagName === "HEAD")
-	inject_csp(node, policy);
+	inject_csp(node, data);
 
     if (node.tagName === "META")
-	sanitize_http_equiv_csp_rule(node, policy);
+	sanitize_http_equiv_csp_rule(node, data);
+
+    if (!data.policy.allow)
+	sanitize_attributes(node, data);
+}
 
-    if (!policy.allow)
-	sanitize_attributes(node, policy);
+/*
+ * Instead of calling writer directly with multiple small chunks of reconstructed
+ * HTML code, we utilize `setTimeout()' to only have it called once,
+ * asynchronously.
+ */
+function do_write_callback(data)
+{
+    data.writer(data.chunks.join(""));
+    data.chunks = [];
+
+    if (data.finished && data.finisher)
+	data.finisher();
+}
+
+function do_write(chunk, data)
+{
+    data.chunks.push(chunk);
+    clearTimeout(data.write_timeout);
+    data.write_timeout = setTimeout(() => do_write_callback(data), 0);
 }
 
 const serializer = new XMLSerializer();
 
-function start_node(node, data)
+function start_serializing_node(node, data)
 {
+    node.hachette_started = true;
+
     if (!data.writer)
 	return;
 
-    node.hachette_started = true;
     const clone = node.cloneNode(false);
     clone.textContent = data.uniq;
-    data.writer(data.uniq_reg.exec(clone.outerHTML)[1]);
+    do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
 }
 
-function finish_node(node, data)
+function finish_serializing_node(node, data)
 {
     const nodes_to_process = [node];
 
@@ -127,40 +151,103 @@ function finish_node(node, data)
     while (nodes_to_process.length > 0) {
 	const node = nodes_to_process.pop();
 	node.remove();
+	node.hachette_ignore = true;
 
 	if (!data.writer)
 	    continue;
 
 	if (node.hachette_started) {
 	    node.textContent = data.uniq;
-	    data.writer(data.uniq_reg.exec(node.outerHTML)[2]);
+	    do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
+	    continue;
+	}
+
+	do_write(node.outerHTML || serializer.serializeToString(node), data);
+    }
+}
+
+function process_initial_nodes(node, data)
+{
+    if (data.processed_initial_nodes)
+	return;
+
+    data.processed_initial_nodes = true;
+
+    start_serializing_node(data.html_root, data);
+
+    const new_added = [];
+    const nodes_to_process = [data.html_root];
+
+    let i = 0;
+    while (nodes_to_process.length > 0) {
+	let current = nodes_to_process.shift();
+
+	if (current.firstChild) {
+	    if (current.firstChild === node)
+		break;
+	    nodes_to_process.unshift(current.firstChild, current);
+	    new_added.push([current.firstChild, current]);
 	    continue;
 	}
 
-	data.writer(node.outerHTML || serializer.serializeToString(node));
+	while (current && !current.nextSibling)
+	    current = nodes_to_process.shift();
+
+	if (!current || current.nextSibling === node)
+	    break;
+
+	nodes_to_process.unshift(current.nextSibling);
+	new_added.push([current.nextSibling, nodes_to_process[1]]);
     }
+
+    data.new_added.unshift(...new_added);
 }
 
 /*
  * Important! Due to some weirdness node.parentElement is not alway correct
- * under Chromium. Track node relations manually.
+ * in MutationRecords under Chromium. Track node relations manually.
  */
 function handle_added_node(node, true_parent, data)
 {
-    if (node.hachette_ignore || true_parent.hachette_ignore)
-	return;
+    /*
+     * Functions we call here might cause new nodes to be injected or found
+     * that require processing before the one we got in function argument.
+     * We rely on those functions putting the node(s) they create/find at the
+     * very beginning of the `new_added' queue and (for created nodes) setting
+     * their `hachette_ignore' property, based on which their MutationRecord
+     * will not be processed. A function can also mark a node already in the
+     * `new_added' queue as not eligible for processing by setting its
+     * `hachette_deleted' property.
+     */
 
-    if (!true_parent.hachette_started)
-	start_node(true_parent, data)
+    process_initial_nodes(node, data);
 
-    sanitize_node(node, data.policy);
+    data.new_added.push([node, true_parent]);
 
-    if (data.node_eater)
-	data.node_eater(node, true_parent);
+    while (data.new_added.length > 0) {
+	[node, true_parent] = data.new_added.shift();
 
-    finish_node(true_parent.hachette_last_added, data);
+	if (true_parent.hachette_deleted)
+	    node.hachette_deleted = true;
+	if (node.hachette_deleted)
+	    continue;
+
+	if (!true_parent.hachette_started)
+	    start_serializing_node(true_parent, data)
+
+	if (!node.hachette_ignore)
+	    sanitize_node(node, data);
+
+	if (node.hachette_deleted)
+	    continue;
+
+	if (data.node_eater)
+	    data.node_eater(node, true_parent);
 
-    true_parent.hachette_last_added = node;
+	finish_serializing_node(true_parent.hachette_last_added, data);
+
+	true_parent.hachette_last_added = node;
+    }
 }
 
 function handle_mutation(mutations, data)
@@ -170,28 +257,76 @@ function handle_mutation(mutations, data)
      * node.parentElement. The former is the correct one.
      */
     for (const mutation of mutations) {
-	for (const node of mutation.addedNodes)
+	for (const node of mutation.addedNodes) {
+	    /* Check for nodes added by ourselves. */
+	    if (mutation.target.hachette_ignore)
+		node.hachette_ignore = true;
+	    if (node.hachette_ignore)
+		continue;
+
 	    handle_added_node(node, mutation.target, data);
+	}
     }
 }
 
 function finish_processing(data)
 {
+    process_initial_nodes(undefined, data);
+
+    /*
+     * The `finisher' callback should be called, if provided. Normally our
+     * function that performs the last write does it after seeing `finished'
+     * set to `true'. If, however, there's no `writer' callback and hence no
+     * writes to perform, we need to take care of calling `finisher' here.
+     */
+    data.finished = true;
     handle_mutation(data.observer.takeRecords(), data);
-    finish_node(data.html_element, data);
     data.observer.disconnect();
+
+    /*
+     * Additional whitespace that was after `</body>' gets appended to body.
+     * Although it's a minor issue, it is not what we want. There's no way to
+     * tell exactly what part of that whitespace was after `</body>' and what
+     * was before, so we just replace it with a single newline which looks good
+     * when printed.
+     */
+    const body = data.html_root.lastChild;
+    const text = body && body.tagName === "BODY" && body.lastChild;
+    if (text && text.nodeName === "#text") {
+	const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
+	text.textContent = new_content + "\n";
+    }
+
+    finish_serializing_node(data.html_root, data);
+    if (!data.writer && data.finisher)
+	setTimeout(data.finisher, 0);
 }
 
-function modify_on_the_fly(html_element, policy, consumers)
+/*
+ * This function sanitizes `html_root' according to `policy'. It is capable of
+ * working on an HTML document that is being written to, sanitizing new nodes
+ * as they appear.
+ *
+ * `consumers' object may contain 3 optional callback functions: `writer',
+ * `node_eater' and `finisher'. The first one, if present, is called with chunks
+ * of reconstructed HTML code. The second one, if present, gets called for every
+ * added node with 2 arguments: that node and its parent. The third one is
+ * called at the end, after all processing has been done.
+ *
+ * `modify_on_the_fly()' returns a callback that should be called (with no
+ * arguments) once the document of html_root has finished being written to.
+ * Unfortunately, due to the specific behavior of a document that has had its
+ * documentElement replaced, the caller has to signal this explicitly.
+ */
+function modify_on_the_fly(html_root, policy, consumers)
 {
     const uniq = gen_nonce();
-    const uniq_reg = new RegExp(`^(.*)${uniq}(.*)$`);
-    const data = {policy, html_element, uniq, uniq_reg, ...consumers};
-
-    start_node(data.html_element, data);
+    const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
+    const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
+    Object.assign(data, consumers);
 
     var observer = new MutationObserver(m => handle_mutation(m, data));
-    observer.observe(data.html_element, {
+    observer.observe(data.html_root, {
      	attributes: true,
 	childList: true,
 	subtree: true