/**
 * Hachette modify HTML document as it loads and reconstruct HTML code from it
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * IMPORTS_START
 * IMPORT gen_nonce
 * IMPORT csp_rule
 * IMPORT is_csp_header_name
 * IMPORT sanitize_csp_header
 * IMPORT sanitize_attributes
 * IMPORTS_END
 */

/*
 * Functions that sanitize elements. The script blocking measures are, when
 * possible, going to be applied together with CSP rules injected using
 * webRequest.
 */

const blocked = "blocked";

/*
 * Disable attribute `attr' of `node' by moving its value to an attribute named
 * `blocked-<attr>' and removing the original.
 *
 * Disabling attributes this way allows them to still be relatively easily
 * accessed in case they contain some useful data.
 *
 * If `blocked-<attr>' (or `blocked-blocked-<attr>', etc.) is already present,
 * the existing values are first shifted one `blocked-' prefix further, so that
 * nothing gets overwritten. The same "-"-joined name is used both when probing
 * for collisions and when writing, so the two always agree.
 */
function block_attribute(node, attr) {
    /* Find the shortest `blocked-...-<attr>' name that is not yet taken. */
    const construct_name = [attr];
    while (node.hasAttribute(construct_name.join("-")))
        construct_name.unshift(blocked);

    /* Shift values outwards, starting with the most-prefixed one. */
    while (construct_name.length > 1) {
        construct_name.shift();
        const name = construct_name.join("-");
        node.setAttribute(`${blocked}-${name}`, node.getAttribute(name));
    }

    node.removeAttribute(attr);
}

/*
 * Sanitize a <script> element.
 *
 * A script whose `data-hachette-deleteme' attribute equals the policy nonce is
 * removed and marked as deleted and ignored. Unless the policy allows scripts,
 * the script's `type' is then blocked and replaced with "application/json".
 */
function sanitize_script(script, data) {
    if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
        script.remove();
        script.hachette_deleted = true;
        script.hachette_ignore = true;
    }

    if (data.policy.allow)
        return;

    block_attribute(script, "type");
    script.setAttribute("type", "application/json");
}

/*
 * Unless the policy allows scripts, prepend to `head' a <meta> element
 * carrying a Content-Security-Policy built with `csp_rule()'. The new element
 * is marked with `hachette_ignore' and put at the front of the `new_added'
 * queue.
 */
function inject_csp(head, data) {
    if (data.policy.allow)
        return;

    const meta = document.createElement("meta");
    meta.setAttribute("http-equiv", "Content-Security-Policy");
    meta.setAttribute("content", csp_rule(data.policy.nonce));
    meta.hachette_ignore = true;
    head.prepend(meta);
    data.new_added.unshift([meta, head]);
}

/*
 * Sanitize a <meta http-equiv=...> element that carries a CSP rule.
 *
 * Elements with empty content, or whose `http-equiv' is not recognized by
 * `is_csp_header_name()', are left alone. Otherwise the `content' attribute is
 * blocked and, when appropriate, replaced with the output of
 * `sanitize_csp_header()'.
 */
function sanitize_http_equiv_csp_rule(meta, data) {
    const http_equiv = meta.getAttribute("http-equiv");
    const value = meta.content;

    if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
        return;

    block_attribute(meta, "content");

    if (data.policy.allow || is_csp_header_name(http_equiv, false))
        meta.content = sanitize_csp_header({value}, data.policy).value;
}

/*
 * Dispatch `node' to the sanitizing functions that match its tag name. When
 * the policy does not allow scripts, the node's attributes also get sanitized
 * with `sanitize_attributes()'.
 */
function sanitize_node(node, data) {
    if (node.tagName === "SCRIPT")
        sanitize_script(node, data);

    if (node.tagName === "HEAD")
        inject_csp(node, data);

    if (node.tagName === "META")
        sanitize_http_equiv_csp_rule(node, data);

    if (!data.policy.allow)
        sanitize_attributes(node, data);
}

/*
 * Instead of calling writer directly with multiple small chunks of
 * reconstructed HTML code, we utilize `setTimeout()' to only have it called
 * once, asynchronously.
 */

/*
 * Pass all chunks gathered so far to the writer as a single string and reset
 * the chunk list. If processing has already finished and a `finisher' was
 * provided, call it afterwards.
 */
function do_write_callback(data) {
    data.writer(data.chunks.join(""));
    data.chunks = [];

    if (data.finished && data.finisher)
        data.finisher();
}

/*
 * Queue `chunk' for writing. Any previously scheduled write is cancelled and
 * a new one is scheduled, so consecutive synchronous calls result in a single
 * call to the writer.
 */
function do_write(chunk, data) {
    data.chunks.push(chunk);
    clearTimeout(data.write_timeout);
    data.write_timeout = setTimeout(() => do_write_callback(data), 0);
}

/*
 * XMLSerializer instance used for nodes that have no `outerHTML'. It is
 * created on first use, because it is only needed as a fallback.
 */
let serializer = null;

/* Return HTML code of `node', falling back to XMLSerializer if needed. */
function serialize_node(node) {
    if (node.outerHTML)
        return node.outerHTML;

    if (serializer === null)
        serializer = new XMLSerializer();

    return serializer.serializeToString(node);
}

/*
 * Mark `node' as started and, if there is a writer, write the part of its
 * HTML code that precedes its contents. That part is obtained by serializing
 * a shallow clone whose text content is the unique marker `data.uniq' and
 * taking everything before the marker.
 */
function start_serializing_node(node, data) {
    node.hachette_started = true;

    if (!data.writer)
        return;

    const clone = node.cloneNode(false);
    clone.textContent = data.uniq;
    do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
}

/*
 * Finish serializing `node' together with the chain of nodes reachable from it
 * through `hachette_last_added'. Nodes are handled innermost first; each one
 * is removed from its parent and marked with `hachette_ignore'. For a started
 * node only the part of the code following its contents gets written; other
 * nodes are written whole. `node' may be undefined, in which case nothing
 * happens.
 */
function finish_serializing_node(node, data) {
    const nodes_to_process = [node];

    /* Collect the chain node -> last added -> last added of that -> ... */
    while (true) {
        node = nodes_to_process.pop();
        if (!node)
            break;

        nodes_to_process.push(node, node.hachette_last_added);
    }

    while (nodes_to_process.length > 0) {
        const node = nodes_to_process.pop();
        node.remove();
        node.hachette_ignore = true;

        if (!data.writer)
            continue;

        if (node.hachette_started) {
            /* Opening part has been written already, emit the rest. */
            node.textContent = data.uniq;
            do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
            continue;
        }

        do_write(serialize_node(node), data);
    }
}

/*
 * On first call only: start serializing `data.html_root' and walk the nodes
 * already present under it in document order, stopping upon reaching `node'.
 * Every node encountered is put, paired with its parent, at the front of the
 * `new_added' queue. Subsequent calls do nothing.
 */
function process_initial_nodes(node, data) {
    if (data.processed_initial_nodes)
        return;

    data.processed_initial_nodes = true;

    start_serializing_node(data.html_root, data);

    const new_added = [];
    /*
     * The front of this array holds the node being visited; behind it are its
     * ancestors, closest first.
     */
    const nodes_to_process = [data.html_root];

    while (nodes_to_process.length > 0) {
        let current = nodes_to_process.shift();

        if (current.firstChild) {
            if (current.firstChild === node)
                break;

            nodes_to_process.unshift(current.firstChild, current);
            new_added.push([current.firstChild, current]);
            continue;
        }

        /* No children; climb up until some node has a next sibling. */
        while (current && !current.nextSibling)
            current = nodes_to_process.shift();

        if (!current || current.nextSibling === node)
            break;

        nodes_to_process.unshift(current.nextSibling);
        /* After the unshift above, index 1 holds the sibling's parent. */
        new_added.push([current.nextSibling, nodes_to_process[1]]);
    }

    data.new_added.unshift(...new_added);
}

/*
 * Important! Due to some weirdness node.parentElement is not always correct
 * in MutationRecords under Chromium. Track node relations manually.
 */
function handle_added_node(node, true_parent, data) {
    /*
     * Functions we call here might cause new nodes to be injected or found
     * that require processing before the one we got in function argument.
     * We rely on those functions putting the node(s) they create/find at the
     * very beginning of the `new_added' queue and (for created nodes) setting
     * their `hachette_ignore' property, based on which their MutationRecord
     * will not be processed. A function can also mark a node already in the
     * `new_added' queue as not eligible for processing by setting its
     * `hachette_deleted' property.
     */

    process_initial_nodes(node, data);

    data.new_added.push([node, true_parent]);

    while (data.new_added.length > 0) {
        [node, true_parent] = data.new_added.shift();

        if (true_parent.hachette_deleted)
            node.hachette_deleted = true;
        if (node.hachette_deleted)
            continue;

        if (!true_parent.hachette_started)
            start_serializing_node(true_parent, data);

        if (!node.hachette_ignore)
            sanitize_node(node, data);

        /* Sanitizing might have marked the node as deleted. */
        if (node.hachette_deleted)
            continue;

        if (data.node_eater)
            data.node_eater(node, true_parent);

        /* The previous sibling subtree is complete now; write it out. */
        finish_serializing_node(true_parent.hachette_last_added, data);

        true_parent.hachette_last_added = node;
    }
}

/*
 * MutationObserver callback body: process every added node from `mutations'
 * unless it was added by ourselves.
 */
function handle_mutation(mutations, data) {
    /*
     * Chromium: for an unknown reason mutation.target is not always the same
     * as node.parentElement. The former is the correct one.
     */
    for (const mutation of mutations) {
        for (const node of mutation.addedNodes) {
            /* Check for nodes added by ourselves. */
            if (mutation.target.hachette_ignore)
                node.hachette_ignore = true;
            if (node.hachette_ignore)
                continue;

            handle_added_node(node, mutation.target, data);
        }
    }
}

/*
 * Complete the processing: handle mutation records that are still pending,
 * disconnect the observer and write out whatever remains of `html_root'.
 */
function finish_processing(data) {
    process_initial_nodes(undefined, data);

    /*
     * The `finisher' callback should be called, if provided. Normally our
     * function that performs the last write does it after seeing `finished'
     * set to `true'. If, however, there's no `writer' callback and hence no
     * writes to perform, we need to take care of calling `finisher' here
     * (see the end of this function).
     */
    data.finished = true;

    handle_mutation(data.observer.takeRecords(), data);
    data.observer.disconnect();

    /*
     * Additional whitespace that was after `</body>' gets appended to body.
     * Although it's a minor issue, it is not what we want. There's no way to
     * tell exactly what part of that whitespace was after `</body>' and what
     * was before, so we just replace it with a single newline which looks good
     * when printed.
     */
    const body = data.html_root.lastChild;
    const text = body && body.tagName === "BODY" && body.lastChild;
    if (text && text.nodeName === "#text") {
        const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
        text.textContent = new_content + "\n";
    }

    finish_serializing_node(data.html_root, data);

    if (!data.writer && data.finisher)
        setTimeout(data.finisher, 0);
}

/*
 * This function sanitizes `html_root' according to `policy'. It is capable of
 * working on an HTML document that is being written to, sanitizing new nodes
 * as they appear.
 *
 * `consumers' object may contain 3 optional callback functions: `writer',
 * `node_eater' and `finisher'. The first one, if present, is called with chunks
 * of reconstructed HTML code. The second one, if present, gets called for every
 * added node with 2 arguments: that node and its parent. The third one is
 * called at the end, after all processing has been done.
 *
 * `modify_on_the_fly()' returns a callback that should be called (with no
 * arguments) once the document of html_root has finished being written to.
 *
 * NOTE(review): the original comment broke off mid-sentence here, after
 * mentioning "specifics behavior of document that has had its documentElement
 * replaced". Presumably that behavior is the reason completion has to be
 * signalled explicitly by the caller -- TODO confirm.
 */
function modify_on_the_fly(html_root, policy, consumers) {
    /* Marker used to split serialized elements into opening/closing parts. */
    const uniq = gen_nonce();
    const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
    const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
    Object.assign(data, consumers);

    const observer = new MutationObserver(m => handle_mutation(m, data));
    observer.observe(data.html_root, {
        attributes: true,
        childList: true,
        subtree: true
    });

    data.observer = observer;

    return () => finish_processing(data);
}

/*
 * EXPORTS_START
 * EXPORT modify_on_the_fly
 * EXPORTS_END
 */