From 44958e6ab4218429475f3c79ecf2116b78a07021 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Thu, 2 Sep 2021 21:32:48 +0200 Subject: implement rethinked <meta> tags sanitizing approach This has not been tested yet. Additionally, functionality for blocking of `data:' urls needs to be re-enabled. --- common/misc.js | 2 +- content/main.js | 155 +++++++++++++++++-- content/sanitize_document.js | 344 ------------------------------------------- 3 files changed, 144 insertions(+), 357 deletions(-) delete mode 100644 content/sanitize_document.js diff --git a/common/misc.js b/common/misc.js index 8894d60..30a9e77 100644 --- a/common/misc.js +++ b/common/misc.js @@ -178,7 +178,7 @@ function sanitize_csp_header(header, policy) return {name: header.name, value: new_csp.join('')}; } -/* Regexes and objest to use as/in schemas for parse_json_with_schema(). */ +/* Regexes and objects to use as/in schemas for parse_json_with_schema(). */ const nonempty_string_matcher = /.+/; const matchers = { diff --git a/content/main.js b/content/main.js index 4ae7738..8440eb5 100644 --- a/content/main.js +++ b/content/main.js @@ -16,7 +16,9 @@ * IMPORT is_chrome * IMPORT is_mozilla * IMPORT start_activity_info_server - * IMPORT modify_on_the_fly + * IMPORT csp_rule + * IMPORT is_csp_header_name + * IMPORT sanitize_csp_header * IMPORTS_END */ @@ -31,6 +33,143 @@ function accept_node(node, parent) parent.hachette_corresponding.appendChild(clone); } +/* + * 1. When injecting some payload we need to sanitize <meta> CSP tags before + * they reach the document. + * 2. Only <meta> tags inside <head> are considered valid by the browser and + * need to be considered. + * 3. We want to detach <html> from document, wait until its <head> completes + * loading, sanitize it and re-attach <html>. + * 4. Browsers are eager to add <meta>'s that appear after `</head>' but before + * `<body>'. Due to this behavior the `DOMContentLoaded' event is considered + * unreliable (although it could still work properly, it is just problematic + * to verify). + * 5. 
We shall wait for anything to appear in or after <body> and take that as + * a sign <head> has _really_ finished loading. + */ + +function make_body_start_observer(DOM_element, waiting) +{ + const observer = new MutationObserver(() => try_body_started(waiting)); + observer.observe(DOM_element, {childList: true}); + return observer; +} + +function try_body_started(waiting) +{ + const body = waiting.detached_html.querySelector("body"); + + if ((body && (body.firstChild || body.nextSibling)) || + waiting.doc.documentElement.nextSibling) { + finish_waiting(waiting); + return true; + } + + if (body && waiting.observers.length < 2) + waiting.observers.push(make_body_start_observer(body, waiting)); +} + +function finish_waiting(waiting) +{ + waiting.observers.forEach(observer => observer.disconnect()); + waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb); + setTimeout(waiting.callback, 0); +} + +function _wait_for_head(doc, detached_html, callback) +{ + const waiting = {doc, detached_html, callback, observers: []}; + if (try_body_started(waiting)) + return; + + waiting.observers = [make_body_start_observer(detached_html, waiting)]; + waiting.loaded_cb = () => finish_waiting(waiting); + doc.addEventListener("DOMContentLoaded", waiting.loaded_cb); +} + +function wait_for_head(doc, detached_html) +{ + return new Promise(cb => _wait_for_head(doc, detached_html, cb)); +} + +const blocked_str = "blocked"; + +function block_attribute(node, attr) +{ + /* + * Disabling attributes this way allows them to still be relatively + * easily accessed in case they contain some useful data. 
+ */ + const construct_name = [attr]; + while (node.hasAttribute(construct_name.join(""))) + construct_name.unshift(blocked_str); + + while (construct_name.length > 1) { + construct_name.shift(); + const name = construct_name.join(""); + node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name)); + } + + node.removeAttribute(attr); +} + +function sanitize_meta(meta, policy) +{ + const http_equiv = meta.getAttribute("http-equiv"); + const value = meta.content; + + if (!value || !is_csp_header_name(http_equiv, true)) + return; + + block_attribute(meta, "content"); + + if (is_csp_header_name(http_equiv, false)) + meta.content = sanitize_csp_header({value}, policy).value; +} + +function apply_hachette_csp_rules(doc, policy) +{ + const meta = doc.createElement("meta"); + meta.setAttribute("http-equiv", "Content-Security-Policy"); + meta.setAttribute("content", csp_rule(policy.nonce)); + doc.head.append(meta); + /* CSP is already in effect, we can remove the <meta> now. */ + meta.remove(); +} + +async function sanitize_document(doc, policy) +{ + /* + * Ensure our CSP rules are employed from the beginning. This CSP injection + * method is, when possible, going to be applied together with CSP rules + * injected using webRequest. + */ + const has_own_head = doc.head; + if (!has_own_head) + doc.documentElement.prepend(doc.createElement("head")); + + apply_hachette_csp_rules(doc, policy); + + /* Probably not needed, but...: proceed with DOM in its initial state. */ + if (!has_own_head) + doc.head.remove(); + + /* + * <html> node gets hijacked now, to be re-attached after <head> is loaded + * and sanitized. 
+ */ + const old_html = doc.documentElement; + const new_html = doc.createElement("html"); + old_html.replaceWith(new_html); + + await wait_for_head(doc, old_html); + + for (const meta of old_html.querySelectorAll("head meta")) + sanitize_meta(meta, policy); + + new_html.replaceWith(old_html); +} + if (!is_privileged_url(document.URL)) { const reductor = (ac, [_, sig, pol]) => ac[0] && ac || [extract_signed(sig, pol), sig]; @@ -45,18 +184,10 @@ if (!is_privileged_url(document.URL)) { if (signature) document.cookie = `hachette-${signature}=; Max-Age=-1;`; - handle_page_actions(policy.nonce); + if (!policy.allow) + sanitize_document(document, policy); - if (!policy.allow) { - const old_html = document.documentElement; - const new_html = document.createElement("html"); - old_html.replaceWith(new_html); - old_html.hachette_corresponding = new_html; - - const modify_end = - modify_on_the_fly(old_html, policy, {node_eater: accept_node}); - document.addEventListener("DOMContentLoaded", modify_end); - } + handle_page_actions(policy.nonce); start_activity_info_server(); } diff --git a/content/sanitize_document.js b/content/sanitize_document.js deleted file mode 100644 index 727bb6c..0000000 --- a/content/sanitize_document.js +++ /dev/null @@ -1,344 +0,0 @@ -/** - * Hachette modify HTML document as it loads and reconstruct HTML code from it - * - * Copyright (C) 2021 Wojtek Kosior - * Redistribution terms are gathered in the `copyright' file. - */ - -/* - * IMPORTS_START - * IMPORT gen_nonce - * IMPORT csp_rule - * IMPORT is_csp_header_name - * IMPORT sanitize_csp_header - * IMPORT sanitize_attributes - * IMPORTS_END - */ - -/* - * Functions that sanitize elements. The script blocking measures are, when - * possible, going to be applied together with CSP rules injected using - * webRequest. 
- */ - -const blocked = "blocked"; - -function block_attribute(node, attr) -{ - /* - * Disabling attributed this way allows them to still be relatively - * easily accessed in case they contain some useful data. - */ - - const construct_name = [attr]; - while (node.hasAttribute(construct_name.join(""))) - construct_name.unshift(blocked); - - while (construct_name.length > 1) { - construct_name.shift(); - const name = construct_name.join(""); - node.setAttribute(`${blocked}-${name}`, node.getAttribute(name)); - } - - node.removeAttribute(attr); -} - -function sanitize_script(script, data) -{ - if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) { - script.remove(); - script.hachette_deleted = true; - script.hachette_ignore = true; - } - - if (data.policy.allow) - return; - - block_attribute(script, "type"); - script.setAttribute("type", "application/json"); -} - -function inject_csp(head, data) -{ - if (data.policy.allow) - return; - - const meta = document.createElement("meta"); - meta.setAttribute("http-equiv", "Content-Security-Policy"); - meta.setAttribute("content", csp_rule(data.policy.nonce)); - meta.hachette_ignore = true; - head.prepend(meta); - - data.new_added.unshift([meta, head]); -} - -function sanitize_http_equiv_csp_rule(meta, data) -{ - const http_equiv = meta.getAttribute("http-equiv"); - const value = meta.content; - - if (!value || !is_csp_header_name(http_equiv, !data.policy.allow)) - return; - - block_attribute(meta, "content"); - - if (data.policy.allow || is_csp_header_name(http_equiv, false)) - meta.content = sanitize_csp_header({value}, data.policy).value; -} - -function sanitize_node(node, data) -{ - if (node.tagName === "SCRIPT") - sanitize_script(node, data); - - if (node.tagName === "HEAD") - inject_csp(node, data); - - if (node.tagName === "META") - sanitize_http_equiv_csp_rule(node, data); - - if (!data.policy.allow) - sanitize_attributes(node, data); -} - -/* - * Instead of calling writer directly with multiple 
small chunks of reconstruced - * HTML code, we utilize `setTimeout()' to only have it called once, - * asynchronously. - */ -function do_write_callback(data) -{ - data.writer(data.chunks.join("")); - data.chunks = []; - - if (data.finished && data.finisher) - data.finisher(); -} - -function do_write(chunk, data) -{ - data.chunks.push(chunk); - clearTimeout(data.write_timeout); - data.write_timeout = setTimeout(() => do_write_callback(data), 0); -} - -const serializer = new XMLSerializer(); - -function start_serializing_node(node, data) -{ - node.hachette_started = true; - - if (!data.writer) - return; - - const clone = node.cloneNode(false); - clone.textContent = data.uniq; - do_write(data.uniq_reg.exec(clone.outerHTML)[1], data); -} - -function finish_serializing_node(node, data) -{ - const nodes_to_process = [node]; - - while (true) { - node = nodes_to_process.pop(); - if (!node) - break; - - nodes_to_process.push(node, node.hachette_last_added); - } - - while (nodes_to_process.length > 0) { - const node = nodes_to_process.pop(); - node.remove(); - node.hachette_ignore = true; - - if (!data.writer) - continue; - - if (node.hachette_started) { - node.textContent = data.uniq; - do_write(data.uniq_reg.exec(node.outerHTML)[2], data); - continue; - } - - do_write(node.outerHTML || serializer.serializeToString(node), data); - } -} - -function process_initial_nodes(node, data) -{ - if (data.processed_initial_nodes) - return; - - data.processed_initial_nodes = true; - - start_serializing_node(data.html_root, data); - - const new_added = []; - const nodes_to_process = [data.html_root]; - - let i = 0; - while (nodes_to_process.length > 0) { - let current = nodes_to_process.shift(); - - if (current.firstChild) { - if (current.firstChild === node) - break; - nodes_to_process.unshift(current.firstChild, current); - new_added.push([current.firstChild, current]); - continue; - } - - while (current && !current.nextSibling) - current = nodes_to_process.shift(); - - if (!current 
|| current.nextSibling === node) - break; - - nodes_to_process.unshift(current.nextSibling); - new_added.push([current.nextSibling, nodes_to_process[1]]); - } - - data.new_added.unshift(...new_added); -} - -/* - * Important! Due to some weirdness node.parentElement is not alway correct - * in MutationRecords under Chromium. Track node relations manually. - */ -function handle_added_node(node, true_parent, data) -{ - /* - * Functions we call here might cause new nodes to be injected or found - * that require processing before the one we got in function argument. - * We rely on those functions putting the node(s) they create/find at the - * very beginning of the `new_added' queue and (for created nodes) setting - * their `hachette_ignore' property, based on which their MutationRecord - * will not be processed. A function can also mark a node already in the - * `new_added' queue as not eligible for processing by setting its - * `hachette_deleted' property. - */ - - process_initial_nodes(node, data); - - data.new_added.push([node, true_parent]); - - while (data.new_added.length > 0) { - [node, true_parent] = data.new_added.shift(); - - if (true_parent.hachette_deleted) - node.hachette_deleted = true; - if (node.hachette_deleted) - continue; - - if (!true_parent.hachette_started) - start_serializing_node(true_parent, data) - - if (!node.hachette_ignore) - sanitize_node(node, data); - - if (node.hachette_deleted) - continue; - - if (data.node_eater) - data.node_eater(node, true_parent); - - finish_serializing_node(true_parent.hachette_last_added, data); - - true_parent.hachette_last_added = node; - } -} - -function handle_mutation(mutations, data) -{ - /* - * Chromium: for an unknown reason mutation.target is not always the same as - * node.parentElement. The former is the correct one. - */ - for (const mutation of mutations) { - for (const node of mutation.addedNodes) { - /* Check for nodes added by ourselves. 
*/ - if (mutation.target.hachette_ignore) - node.hachette_ignore = true; - if (node.hachette_ignore) - continue; - - handle_added_node(node, mutation.target, data); - } - } -} - -function finish_processing(data) -{ - process_initial_nodes(undefined, data); - - /* - * The `finisher' callback should be called, if provided. Normally our - * function that performs the last write does it after seeing `finished' - * set to `true'. If, however, there's no `writer' callback and hence no - * writes to perform, we need to take care of calling `finisher' here. - */ - data.finished = true; - handle_mutation(data.observer.takeRecords(), data); - data.observer.disconnect(); - - /* - * Additional whitespace that was after `</body>' gets appended to body. - * Although it's a minor issue, it is not what we want. There's no way to - * tell exactly what part of that whitespace was after `</body>' and what - * was before, so we just replace it with a single newline which looks good - * when printed. - */ - const body = data.html_root.lastChild; - const text = body && body.tagName === "BODY" && body.lastChild; - if (text && text.nodeName === "#text") { - const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || ""; - text.textContent = new_content + "\n"; - } - - finish_serializing_node(data.html_root, data); - if (!data.writer && data.finisher) - setTimeout(data.finisher, 0); -} - -/* - * This function sanitizes `html_root' according to `policy'. It is capable of - * working on an HTML document that is being written to, sanitizing new nodes - * as they appear. - * - * `consumers' object may contain 3 optional callback functions: `writer', - * `node_eater' and `finisher'. The first one, if present, is called with chunks - * of reconstructed HTML code. The second one, if present, gets called for every - * added node with 2 arguments: that node and its parent. The third one is - * called at the end, after all processing has been done. 
- * - * `modify_on_the_fly()' returns a callback that should be called (with no - * arguments) once the document of html_root has finished being written to. - * Unfortunately, due to specifics behavior of document that has had its - * documentElement replaced - */ -function modify_on_the_fly(html_root, policy, consumers) -{ - const uniq = gen_nonce(); - const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`); - const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []}; - Object.assign(data, consumers); - - var observer = new MutationObserver(m => handle_mutation(m, data)); - observer.observe(data.html_root, { - attributes: true, - childList: true, - subtree: true - }); - - data.observer = observer; - - return () => finish_processing(data); -} - -/* - * EXPORTS_START - * EXPORT modify_on_the_fly - * EXPORTS_END - */ -- cgit v1.2.3