Diffstat (limited to 'content/sanitize_document.js')
-rw-r--r-- | content/sanitize_document.js | 344
1 file changed, 0 insertions, 344 deletions
diff --git a/content/sanitize_document.js b/content/sanitize_document.js
deleted file mode 100644
index 727bb6c..0000000
--- a/content/sanitize_document.js
+++ /dev/null
@@ -1,344 +0,0 @@
-/**
- * Hachette modify HTML document as it loads and reconstruct HTML code from it
- *
- * Copyright (C) 2021 Wojtek Kosior
- * Redistribution terms are gathered in the `copyright' file.
- */
-
-/*
- * IMPORTS_START
- * IMPORT gen_nonce
- * IMPORT csp_rule
- * IMPORT is_csp_header_name
- * IMPORT sanitize_csp_header
- * IMPORT sanitize_attributes
- * IMPORTS_END
- */
-
-/*
- * Functions that sanitize elements. The script blocking measures are, when
- * possible, going to be applied together with CSP rules injected using
- * webRequest.
- */
-
-const blocked = "blocked";
-
-function block_attribute(node, attr)
-{
-    /*
-     * Disabling attributed this way allows them to still be relatively
-     * easily accessed in case they contain some useful data.
-     */
-
-    const construct_name = [attr];
-    while (node.hasAttribute(construct_name.join("")))
-        construct_name.unshift(blocked);
-
-    while (construct_name.length > 1) {
-        construct_name.shift();
-        const name = construct_name.join("");
-        node.setAttribute(`${blocked}-${name}`, node.getAttribute(name));
-    }
-
-    node.removeAttribute(attr);
-}
-
-function sanitize_script(script, data)
-{
-    if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
-        script.remove();
-        script.hachette_deleted = true;
-        script.hachette_ignore = true;
-    }
-
-    if (data.policy.allow)
-        return;
-
-    block_attribute(script, "type");
-    script.setAttribute("type", "application/json");
-}
-
-function inject_csp(head, data)
-{
-    if (data.policy.allow)
-        return;
-
-    const meta = document.createElement("meta");
-    meta.setAttribute("http-equiv", "Content-Security-Policy");
-    meta.setAttribute("content", csp_rule(data.policy.nonce));
-    meta.hachette_ignore = true;
-    head.prepend(meta);
-
-    data.new_added.unshift([meta, head]);
-}
-
-function sanitize_http_equiv_csp_rule(meta, data)
-{
-    const http_equiv = meta.getAttribute("http-equiv");
-    const value = meta.content;
-
-    if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
-        return;
-
-    block_attribute(meta, "content");
-
-    if (data.policy.allow || is_csp_header_name(http_equiv, false))
-        meta.content = sanitize_csp_header({value}, data.policy).value;
-}
-
-function sanitize_node(node, data)
-{
-    if (node.tagName === "SCRIPT")
-        sanitize_script(node, data);
-
-    if (node.tagName === "HEAD")
-        inject_csp(node, data);
-
-    if (node.tagName === "META")
-        sanitize_http_equiv_csp_rule(node, data);
-
-    if (!data.policy.allow)
-        sanitize_attributes(node, data);
-}
-
-/*
- * Instead of calling writer directly with multiple small chunks of reconstruced
- * HTML code, we utilize `setTimeout()' to only have it called once,
- * asynchronously.
- */
-function do_write_callback(data)
-{
-    data.writer(data.chunks.join(""));
-    data.chunks = [];
-
-    if (data.finished && data.finisher)
-        data.finisher();
-}
-
-function do_write(chunk, data)
-{
-    data.chunks.push(chunk);
-    clearTimeout(data.write_timeout);
-    data.write_timeout = setTimeout(() => do_write_callback(data), 0);
-}
-
-const serializer = new XMLSerializer();
-
-function start_serializing_node(node, data)
-{
-    node.hachette_started = true;
-
-    if (!data.writer)
-        return;
-
-    const clone = node.cloneNode(false);
-    clone.textContent = data.uniq;
-    do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
-}
-
-function finish_serializing_node(node, data)
-{
-    const nodes_to_process = [node];
-
-    while (true) {
-        node = nodes_to_process.pop();
-        if (!node)
-            break;
-
-        nodes_to_process.push(node, node.hachette_last_added);
-    }
-
-    while (nodes_to_process.length > 0) {
-        const node = nodes_to_process.pop();
-        node.remove();
-        node.hachette_ignore = true;
-
-        if (!data.writer)
-            continue;
-
-        if (node.hachette_started) {
-            node.textContent = data.uniq;
-            do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
-            continue;
-        }
-
-        do_write(node.outerHTML || serializer.serializeToString(node), data);
-    }
-}
-
-function process_initial_nodes(node, data)
-{
-    if (data.processed_initial_nodes)
-        return;
-
-    data.processed_initial_nodes = true;
-
-    start_serializing_node(data.html_root, data);
-
-    const new_added = [];
-    const nodes_to_process = [data.html_root];
-
-    let i = 0;
-    while (nodes_to_process.length > 0) {
-        let current = nodes_to_process.shift();
-
-        if (current.firstChild) {
-            if (current.firstChild === node)
-                break;
-            nodes_to_process.unshift(current.firstChild, current);
-            new_added.push([current.firstChild, current]);
-            continue;
-        }
-
-        while (current && !current.nextSibling)
-            current = nodes_to_process.shift();
-
-        if (!current || current.nextSibling === node)
-            break;
-
-        nodes_to_process.unshift(current.nextSibling);
-        new_added.push([current.nextSibling, nodes_to_process[1]]);
-    }
-
-    data.new_added.unshift(...new_added);
-}
-
-/*
- * Important! Due to some weirdness node.parentElement is not alway correct
- * in MutationRecords under Chromium. Track node relations manually.
- */
-function handle_added_node(node, true_parent, data)
-{
-    /*
-     * Functions we call here might cause new nodes to be injected or found
-     * that require processing before the one we got in function argument.
-     * We rely on those functions putting the node(s) they create/find at the
-     * very beginning of the `new_added' queue and (for created nodes) setting
-     * their `hachette_ignore' property, based on which their MutationRecord
-     * will not be processed. A function can also mark a node already in the
-     * `new_added' queue as not eligible for processing by setting its
-     * `hachette_deleted' property.
-     */
-
-    process_initial_nodes(node, data);
-
-    data.new_added.push([node, true_parent]);
-
-    while (data.new_added.length > 0) {
-        [node, true_parent] = data.new_added.shift();
-
-        if (true_parent.hachette_deleted)
-            node.hachette_deleted = true;
-        if (node.hachette_deleted)
-            continue;
-
-        if (!true_parent.hachette_started)
-            start_serializing_node(true_parent, data)
-
-        if (!node.hachette_ignore)
-            sanitize_node(node, data);
-
-        if (node.hachette_deleted)
-            continue;
-
-        if (data.node_eater)
-            data.node_eater(node, true_parent);
-
-        finish_serializing_node(true_parent.hachette_last_added, data);
-
-        true_parent.hachette_last_added = node;
-    }
-}
-
-function handle_mutation(mutations, data)
-{
-    /*
-     * Chromium: for an unknown reason mutation.target is not always the same as
-     * node.parentElement. The former is the correct one.
-     */
-    for (const mutation of mutations) {
-        for (const node of mutation.addedNodes) {
-            /* Check for nodes added by ourselves. */
-            if (mutation.target.hachette_ignore)
-                node.hachette_ignore = true;
-            if (node.hachette_ignore)
-                continue;
-
-            handle_added_node(node, mutation.target, data);
-        }
-    }
-}
-
-function finish_processing(data)
-{
-    process_initial_nodes(undefined, data);
-
-    /*
-     * The `finisher' callback should be called, if provided. Normally our
-     * function that performs the last write does it after seeing `finished'
-     * set to `true'. If, however, there's no `writer' callback and hence no
-     * writes to perform, we need to take care of calling `finisher' here.
-     */
-    data.finished = true;
-    handle_mutation(data.observer.takeRecords(), data);
-    data.observer.disconnect();
-
-    /*
-     * Additional whitespace that was after `</body>' gets appended to body.
-     * Although it's a minor issue, it is not what we want. There's no way to
-     * tell exactly what part of that whitespace was after `</body>' and what
-     * was before, so we just replace it with a single newline which looks good
-     * when printed.
-     */
-    const body = data.html_root.lastChild;
-    const text = body && body.tagName === "BODY" && body.lastChild;
-    if (text && text.nodeName === "#text") {
-        const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
-        text.textContent = new_content + "\n";
-    }
-
-    finish_serializing_node(data.html_root, data);
-    if (!data.writer && data.finisher)
-        setTimeout(data.finisher, 0);
-}
-
-/*
- * This function sanitizes `html_root' according to `policy'. It is capable of
- * working on an HTML document that is being written to, sanitizing new nodes
- * as they appear.
- *
- * `consumers' object may contain 3 optional callback functions: `writer',
- * `node_eater' and `finisher'. The first one, if present, is called with chunks
- * of reconstructed HTML code. The second one, if present, gets called for every
- * added node with 2 arguments: that node and its parent. The third one is
- * called at the end, after all processing has been done.
- *
- * `modify_on_the_fly()' returns a callback that should be called (with no
- * arguments) once the document of html_root has finished being written to.
- * Unfortunately, due to specifics behavior of document that has had its
- * documentElement replaced
- */
-function modify_on_the_fly(html_root, policy, consumers)
-{
-    const uniq = gen_nonce();
-    const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
-    const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
-    Object.assign(data, consumers);
-
-    var observer = new MutationObserver(m => handle_mutation(m, data));
-    observer.observe(data.html_root, {
-        attributes: true,
-        childList: true,
-        subtree: true
-    });
-
-    data.observer = observer;
-
-    return () => finish_processing(data);
-}
-
-/*
- * EXPORTS_START
- * EXPORT modify_on_the_fly
- * EXPORTS_END
- */
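
For readers skimming the removed code, here is a small illustrative sketch (not part of the extension) of what `block_attribute()' does to an element, assuming the definition shown in the diff above is in scope. The attribute's value is preserved under a `blocked-'-prefixed name rather than discarded.

/* Illustration only; block_attribute() is the function from the diff above. */
const script = document.createElement("script");
script.setAttribute("type", "text/javascript");

block_attribute(script, "type");

console.log(script.getAttribute("type"));          // null (attribute removed)
console.log(script.getAttribute("blocked-type"));  // "text/javascript" (data kept)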
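The `do_write()'/`do_write_callback()' pair batches output chunks behind a zero-delay timeout so that the `writer' callback fires once per burst of writes rather than once per chunk. The following standalone sketch of the same pattern uses hypothetical names (`make_batched_writer', `write_out') that do not appear in the file.

/* Hypothetical standalone version of the batching trick used by do_write(). */
function make_batched_writer(write_out) {
    const state = {chunks: [], timeout: null};
    return chunk => {
        state.chunks.push(chunk);
        /* Resetting the timeout on every call coalesces a burst of writes. */
        clearTimeout(state.timeout);
        state.timeout = setTimeout(() => {
            write_out(state.chunks.join(""));
            state.chunks = [];
        }, 0);
    };
}

const write = make_batched_writer(html => console.log(html));
write("<p>");
write("hello");
write("</p>");   /* logs "<p>hello</p>" once, asynchronously */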
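`start_serializing_node()' and `finish_serializing_node()' rely on a marker trick: serializing a shallow clone whose only content is a unique string lets a regular expression split the element's HTML into its opening-tag and closing-tag parts. A minimal sketch, with a hard-coded marker standing in for a `gen_nonce()' value:

/* Sketch of the marker trick; "1f2e3d4c5b6a" stands in for a gen_nonce() value. */
const uniq = "1f2e3d4c5b6a";
const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);

const div = document.createElement("div");
div.setAttribute("id", "content");

const clone = div.cloneNode(false);
clone.textContent = uniq;

const [, opening, closing] = uniq_reg.exec(clone.outerHTML);
console.log(opening);  // '<div id="content">'
console.log(closing);  // '</div>'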
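Finally, a minimal usage sketch of the removed `modify_on_the_fly()' API, based only on the header comment above. The `{allow, nonce}' shape of `policy' is inferred from how the rest of the file reads it, and the "load" event used to trigger the returned callback is the caller's choice; both are assumptions rather than documented behaviour.

/* Hypothetical caller; the policy shape and the "load" trigger are assumptions. */
const chunks = [];
const policy = {allow: false, nonce: gen_nonce()};

const finish = modify_on_the_fly(document.documentElement, policy, {
    writer:     chunk => chunks.push(chunk),
    node_eater: (node, parent) => console.debug("added", node, "under", parent),
    finisher:   () => console.log(chunks.join(""))
});

/* Call the returned callback once the document has finished being written to. */
window.addEventListener("load", finish);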