diff options
-rw-r--r-- | background/main.js | 6 | ||||
-rw-r--r-- | background/stream_filter.js | 47 | ||||
-rw-r--r-- | common/misc.js | 2 | ||||
-rw-r--r-- | content/main.js | 178 | ||||
-rw-r--r-- | content/page_actions.js | 8 | ||||
-rw-r--r-- | content/sanitize_document.js | 344 |
6 files changed, 200 insertions, 385 deletions
diff --git a/background/main.js b/background/main.js index b1c252a..03cd5d7 100644 --- a/background/main.js +++ b/background/main.js @@ -60,9 +60,11 @@ function on_headers_received(details) return; const [pattern, settings] = query_best(storage, details.url); - const allow = !!(settings ? settings.allow : policy_observable.value); + const has_payload = !!(settings && settings.components); + const allow = !has_payload && + !!(settings ? settings.allow : policy_observable.value); const nonce = gen_nonce(); - const policy = {allow, url, nonce}; + const policy = {allow, url, nonce, has_payload}; let headers = details.responseHeaders; let skip = false; diff --git a/background/stream_filter.js b/background/stream_filter.js index 2dce811..96b6132 100644 --- a/background/stream_filter.js +++ b/background/stream_filter.js @@ -12,6 +12,7 @@ /* * IMPORTS_START * IMPORT browser + * IMPORT is_csp_header_name * IMPORTS_END */ @@ -110,6 +111,35 @@ function create_decoder(properties, data) return new TextDecoder(charset || "latin1"); } +function may_define_csp_rules(html) +{ + const doc = new DOMParser().parseFromString(html, "text/html"); + + for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) { + if (is_csp_header_name(meta.getAttribute("http-equiv"), true) && + meta.content) + return true; + } + + /* + * Even if no naughty `<meta>' tags were found, subsequent chunk of HTML + * data could add some. Before we return `false' we need to be sure we + * reached the start of `<body>' where `<meta>' tags are no longer valid. + */ + + if (doc.documentElement.nextSibling || doc.body.nextSibling || + doc.body.childNodes.length > 1) + return false; + + if (!doc.body.firstChild) + return true; + + if (doc.body.firstChild.nodeName !== "#text") + return false; + + return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText); +} + function filter_data(properties, event) { const data = new Uint8Array(event.data); @@ -118,13 +148,15 @@ function filter_data(properties, event) first_chunk = true; properties.decoder = create_decoder(properties, data); properties.encoder = new TextEncoder(); - /* Force UTF-8, this is the only encoding we can produce. */ - properties.filter.write(new Uint8Array(UTF8_BOM)); } let decoded = properties.decoder.decode(data); - if (first_chunk) { + /* Force UTF-8, this is the only encoding we can produce. */ + if (first_chunk) + properties.filter.write(new Uint8Array(UTF8_BOM)); + + if (first_chunk && may_define_csp_rules(decoded)) { /* * HAX! Our content scripts that execute at `document_start' will always * run before the first script in the document, but under Mozilla some @@ -133,7 +165,14 @@ function filter_data(properties, event) * will force `document_start' to happen earlier. This way our content * scripts will be able to sanitize `http-equiv' tags with CSP rules * that would otherwise stop our injected scripts from executing. + * + * As we want to only process HTML files that happen to have naughty + * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in + * `may_define_rules()'. We don't do any additional MIME sniffing as it + * is too unreliable (and our heuristic will likely mark non-HTML files + * as harmless anyway). */ + const dummy_script = `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`; const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0]; @@ -149,7 +188,7 @@ function filter_data(properties, event) function apply_stream_filter(details, headers, policy) { - if (policy.allow) + if (!policy.has_payload) return headers; const properties = properties_from_headers(headers); diff --git a/common/misc.js b/common/misc.js index fd70f62..91d60d2 100644 --- a/common/misc.js +++ b/common/misc.js @@ -146,7 +146,7 @@ function sanitize_csp_header(header, policy) return {name: header.name, value: new_csp.join('')}; } -/* Regexes and objest to use as/in schemas for parse_json_with_schema(). */ +/* Regexes and objects to use as/in schemas for parse_json_with_schema(). */ const nonempty_string_matcher = /.+/; const matchers = { diff --git a/content/main.js b/content/main.js index da215b9..4fe6d43 100644 --- a/content/main.js +++ b/content/main.js @@ -17,21 +17,12 @@ * IMPORT is_chrome * IMPORT is_mozilla * IMPORT start_activity_info_server - * IMPORT modify_on_the_fly + * IMPORT csp_rule + * IMPORT is_csp_header_name + * IMPORT sanitize_csp_header * IMPORTS_END */ -function accept_node(node, parent) -{ - const clone = document.importNode(node, false); - node.hachette_corresponding = clone; - /* - * TODO: Stop page's own issues like "Error parsing a meta element's - * content:" from appearing as extension's errors. - */ - parent.hachette_corresponding.appendChild(clone); -} - function extract_cookie_policy(cookie, min_time) { let best_result = {time: -1}; @@ -95,6 +86,143 @@ function employ_nonhttp_policy(policy) location.reload(); } +/* + * 1. When injecting some payload we need to sanitize <meta> CSP tags before + * they reach the document. + * 2. Only <meta> tags inside <head> are considered valid by the browser and + * need to be considered. + * 3. We want to detach <html> from document, wait until its <head> completes + * loading, sanitize it and re-attach <html>. + * 4. Browsers are eager to add <meta>'s that appear after `</head>' but before + * `<body>'. Due to this behavior the `DOMContentLoaded' event is considered + * unreliable (although it could still work properly, it is just problematic + * to verify). + * 5. We shall wait for anything to appear in or after <body> and take that as + * a sign <head> has _really_ finished loading. + */ + +function make_body_start_observer(DOM_element, waiting) +{ + const observer = new MutationObserver(() => try_body_started(waiting)); + observer.observe(DOM_element, {childList: true}); + return observer; +} + +function try_body_started(waiting) +{ + const body = waiting.detached_html.querySelector("body"); + + if ((body && (body.firstChild || body.nextSibling)) || + waiting.doc.documentElement.nextSibling) { + finish_waiting(waiting); + return true; + } + + if (body && waiting.observers.length < 2) + waiting.observers.push(make_body_start_observer(body, waiting)); +} + +function finish_waiting(waiting) +{ + waiting.observers.forEach(observer => observer.disconnect()); + waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb); + setTimeout(waiting.callback, 0); +} + +function _wait_for_head(doc, detached_html, callback) +{ + const waiting = {doc, detached_html, callback, observers: []}; + if (try_body_started(waiting)) + return; + + waiting.observers = [make_body_start_observer(detached_html, waiting)]; + waiting.loaded_cb = () => finish_waiting(waiting); + doc.addEventListener("DOMContentLoaded", waiting.loaded_cb); +} + +function wait_for_head(doc, detached_html) +{ + return new Promise(cb => _wait_for_head(doc, detached_html, cb)); +} + +const blocked_str = "blocked"; + +function block_attribute(node, attr) +{ + /* + * Disabling attributes this way allows them to still be relatively + * easily accessed in case they contain some useful data. + */ + const construct_name = [attr]; + while (node.hasAttribute(construct_name.join(""))) + construct_name.unshift(blocked_str); + + while (construct_name.length > 1) { + construct_name.shift(); + const name = construct_name.join(""); + node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name)); + } + + node.removeAttribute(attr); +} + +function sanitize_meta(meta, policy) +{ + const http_equiv = meta.getAttribute("http-equiv"); + const value = meta.content; + + if (!value || !is_csp_header_name(http_equiv, true)) + return; + + block_attribute(meta, "content"); + + if (is_csp_header_name(http_equiv, false)) + meta.content = sanitize_csp_header({value}, policy).value; +} + +function apply_hachette_csp_rules(doc, policy) +{ + const meta = doc.createElement("meta"); + meta.setAttribute("http-equiv", "Content-Security-Policy"); + meta.setAttribute("content", csp_rule(policy.nonce)); + doc.head.append(meta); + /* CSP is already in effect, we can remove the <meta> now. */ + meta.remove(); +} + +async function sanitize_document(doc, policy) +{ + /* + * Ensure our CSP rules are employed from the beginning. This CSP injection + * method is, when possible, going to be applied together with CSP rules + * injected using webRequest. + */ + const has_own_head = doc.head; + if (!has_own_head) + doc.documentElement.prepend(doc.createElement("head")); + + apply_hachette_csp_rules(doc, policy); + + /* Probably not needed, but...: proceed with DOM in its initial state. */ + if (!has_own_head) + doc.head.remove(); + + /* + * <html> node gets hijacked now, to be re-attached after <head> is loaded + * and sanitized. + */ + const old_html = doc.documentElement; + const new_html = doc.createElement("html"); + old_html.replaceWith(new_html); + + await wait_for_head(doc, old_html); + + for (const meta of old_html.querySelectorAll("head meta")) + sanitize_meta(meta, policy); + + new_html.replaceWith(old_html); +} + if (!is_privileged_url(document.URL)) { let policy_received_callback = () => undefined; let policy; @@ -127,25 +255,13 @@ if (!is_privileged_url(document.URL)) { policy = {allow: false, nonce: gen_nonce()}; } - handle_page_actions(policy.nonce, policy_received_callback); - - if (!policy.allow) { - if (is_mozilla) { - const script = document.querySelector("script"); - if (script) - script.textContent = "throw 'blocked';\n" + script.textContent; - } - const old_html = document.documentElement; - const new_html = document.createElement("html"); - old_html.replaceWith(new_html); - old_html.hachette_corresponding = new_html; - - const modify_end = - modify_on_the_fly(old_html, policy, {node_eater: accept_node}); - document.addEventListener("DOMContentLoaded", modify_end); - } + const doc_ready = Promise.all([ + policy.allow ? Promise.resolve : sanitize_document(document, policy), + new Promise(cb => document.addEventListener("DOMContentLoaded", + cb, {once: true})) + ]); + + handle_page_actions(policy.nonce, policy_received_callback, doc_ready); start_activity_info_server(); } - -console.log("content script"); diff --git a/content/page_actions.js b/content/page_actions.js index 3799afd..8057541 100644 --- a/content/page_actions.js +++ b/content/page_actions.js @@ -42,7 +42,7 @@ function handle_message(message) } } -function document_loaded(event) +function document_ready(event) { loaded = true; @@ -66,13 +66,15 @@ function add_script(script_text) report_script(script_text); } -function handle_page_actions(script_nonce, policy_received_cb) { +function handle_page_actions(script_nonce, policy_received_cb, + doc_ready_promise) { policy_received_callback = policy_received_cb; url = document.URL; is_html = /html/.test(document.contentType); report_content_type(document.contentType); - document.addEventListener("DOMContentLoaded", document_loaded); + doc_ready_promise.then(document_ready); + port = browser.runtime.connect({name : CONNECTION_TYPE.PAGE_ACTIONS}); port.onMessage.addListener(handle_message); port.postMessage({url}); diff --git a/content/sanitize_document.js b/content/sanitize_document.js deleted file mode 100644 index 727bb6c..0000000 --- a/content/sanitize_document.js +++ /dev/null @@ -1,344 +0,0 @@ -/** - * Hachette modify HTML document as it loads and reconstruct HTML code from it - * - * Copyright (C) 2021 Wojtek Kosior - * Redistribution terms are gathered in the `copyright' file. - */ - -/* - * IMPORTS_START - * IMPORT gen_nonce - * IMPORT csp_rule - * IMPORT is_csp_header_name - * IMPORT sanitize_csp_header - * IMPORT sanitize_attributes - * IMPORTS_END - */ - -/* - * Functions that sanitize elements. The script blocking measures are, when - * possible, going to be applied together with CSP rules injected using - * webRequest. - */ - -const blocked = "blocked"; - -function block_attribute(node, attr) -{ - /* - * Disabling attributed this way allows them to still be relatively - * easily accessed in case they contain some useful data. - */ - - const construct_name = [attr]; - while (node.hasAttribute(construct_name.join(""))) - construct_name.unshift(blocked); - - while (construct_name.length > 1) { - construct_name.shift(); - const name = construct_name.join(""); - node.setAttribute(`${blocked}-${name}`, node.getAttribute(name)); - } - - node.removeAttribute(attr); -} - -function sanitize_script(script, data) -{ - if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) { - script.remove(); - script.hachette_deleted = true; - script.hachette_ignore = true; - } - - if (data.policy.allow) - return; - - block_attribute(script, "type"); - script.setAttribute("type", "application/json"); -} - -function inject_csp(head, data) -{ - if (data.policy.allow) - return; - - const meta = document.createElement("meta"); - meta.setAttribute("http-equiv", "Content-Security-Policy"); - meta.setAttribute("content", csp_rule(data.policy.nonce)); - meta.hachette_ignore = true; - head.prepend(meta); - - data.new_added.unshift([meta, head]); -} - -function sanitize_http_equiv_csp_rule(meta, data) -{ - const http_equiv = meta.getAttribute("http-equiv"); - const value = meta.content; - - if (!value || !is_csp_header_name(http_equiv, !data.policy.allow)) - return; - - block_attribute(meta, "content"); - - if (data.policy.allow || is_csp_header_name(http_equiv, false)) - meta.content = sanitize_csp_header({value}, data.policy).value; -} - -function sanitize_node(node, data) -{ - if (node.tagName === "SCRIPT") - sanitize_script(node, data); - - if (node.tagName === "HEAD") - inject_csp(node, data); - - if (node.tagName === "META") - sanitize_http_equiv_csp_rule(node, data); - - if (!data.policy.allow) - sanitize_attributes(node, data); -} - -/* - * Instead of calling writer directly with multiple small chunks of reconstruced - * HTML code, we utilize `setTimeout()' to only have it called once, - * asynchronously. - */ -function do_write_callback(data) -{ - data.writer(data.chunks.join("")); - data.chunks = []; - - if (data.finished && data.finisher) - data.finisher(); -} - -function do_write(chunk, data) -{ - data.chunks.push(chunk); - clearTimeout(data.write_timeout); - data.write_timeout = setTimeout(() => do_write_callback(data), 0); -} - -const serializer = new XMLSerializer(); - -function start_serializing_node(node, data) -{ - node.hachette_started = true; - - if (!data.writer) - return; - - const clone = node.cloneNode(false); - clone.textContent = data.uniq; - do_write(data.uniq_reg.exec(clone.outerHTML)[1], data); -} - -function finish_serializing_node(node, data) -{ - const nodes_to_process = [node]; - - while (true) { - node = nodes_to_process.pop(); - if (!node) - break; - - nodes_to_process.push(node, node.hachette_last_added); - } - - while (nodes_to_process.length > 0) { - const node = nodes_to_process.pop(); - node.remove(); - node.hachette_ignore = true; - - if (!data.writer) - continue; - - if (node.hachette_started) { - node.textContent = data.uniq; - do_write(data.uniq_reg.exec(node.outerHTML)[2], data); - continue; - } - - do_write(node.outerHTML || serializer.serializeToString(node), data); - } -} - -function process_initial_nodes(node, data) -{ - if (data.processed_initial_nodes) - return; - - data.processed_initial_nodes = true; - - start_serializing_node(data.html_root, data); - - const new_added = []; - const nodes_to_process = [data.html_root]; - - let i = 0; - while (nodes_to_process.length > 0) { - let current = nodes_to_process.shift(); - - if (current.firstChild) { - if (current.firstChild === node) - break; - nodes_to_process.unshift(current.firstChild, current); - new_added.push([current.firstChild, current]); - continue; - } - - while (current && !current.nextSibling) - current = nodes_to_process.shift(); - - if (!current || current.nextSibling === node) - break; - - nodes_to_process.unshift(current.nextSibling); - new_added.push([current.nextSibling, nodes_to_process[1]]); - } - - data.new_added.unshift(...new_added); -} - -/* - * Important! Due to some weirdness node.parentElement is not alway correct - * in MutationRecords under Chromium. Track node relations manually. - */ -function handle_added_node(node, true_parent, data) -{ - /* - * Functions we call here might cause new nodes to be injected or found - * that require processing before the one we got in function argument. - * We rely on those functions putting the node(s) they create/find at the - * very beginning of the `new_added' queue and (for created nodes) setting - * their `hachette_ignore' property, based on which their MutationRecord - * will not be processed. A function can also mark a node already in the - * `new_added' queue as not eligible for processing by setting its - * `hachette_deleted' property. - */ - - process_initial_nodes(node, data); - - data.new_added.push([node, true_parent]); - - while (data.new_added.length > 0) { - [node, true_parent] = data.new_added.shift(); - - if (true_parent.hachette_deleted) - node.hachette_deleted = true; - if (node.hachette_deleted) - continue; - - if (!true_parent.hachette_started) - start_serializing_node(true_parent, data) - - if (!node.hachette_ignore) - sanitize_node(node, data); - - if (node.hachette_deleted) - continue; - - if (data.node_eater) - data.node_eater(node, true_parent); - - finish_serializing_node(true_parent.hachette_last_added, data); - - true_parent.hachette_last_added = node; - } -} - -function handle_mutation(mutations, data) -{ - /* - * Chromium: for an unknown reason mutation.target is not always the same as - * node.parentElement. The former is the correct one. - */ - for (const mutation of mutations) { - for (const node of mutation.addedNodes) { - /* Check for nodes added by ourselves. */ - if (mutation.target.hachette_ignore) - node.hachette_ignore = true; - if (node.hachette_ignore) - continue; - - handle_added_node(node, mutation.target, data); - } - } -} - -function finish_processing(data) -{ - process_initial_nodes(undefined, data); - - /* - * The `finisher' callback should be called, if provided. Normally our - * function that performs the last write does it after seeing `finished' - * set to `true'. If, however, there's no `writer' callback and hence no - * writes to perform, we need to take care of calling `finisher' here. - */ - data.finished = true; - handle_mutation(data.observer.takeRecords(), data); - data.observer.disconnect(); - - /* - * Additional whitespace that was after `</body>' gets appended to body. - * Although it's a minor issue, it is not what we want. There's no way to - * tell exactly what part of that whitespace was after `</body>' and what - * was before, so we just replace it with a single newline which looks good - * when printed. - */ - const body = data.html_root.lastChild; - const text = body && body.tagName === "BODY" && body.lastChild; - if (text && text.nodeName === "#text") { - const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || ""; - text.textContent = new_content + "\n"; - } - - finish_serializing_node(data.html_root, data); - if (!data.writer && data.finisher) - setTimeout(data.finisher, 0); -} - -/* - * This function sanitizes `html_root' according to `policy'. It is capable of - * working on an HTML document that is being written to, sanitizing new nodes - * as they appear. - * - * `consumers' object may contain 3 optional callback functions: `writer', - * `node_eater' and `finisher'. The first one, if present, is called with chunks - * of reconstructed HTML code. The second one, if present, gets called for every - * added node with 2 arguments: that node and its parent. The third one is - * called at the end, after all processing has been done. - * - * `modify_on_the_fly()' returns a callback that should be called (with no - * arguments) once the document of html_root has finished being written to. - * Unfortunately, due to specifics behavior of document that has had its - * documentElement replaced - */ -function modify_on_the_fly(html_root, policy, consumers) -{ - const uniq = gen_nonce(); - const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`); - const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []}; - Object.assign(data, consumers); - - var observer = new MutationObserver(m => handle_mutation(m, data)); - observer.observe(data.html_root, { - attributes: true, - childList: true, - subtree: true - }); - - data.observer = observer; - - return () => finish_processing(data); -} - -/* - * EXPORTS_START - * EXPORT modify_on_the_fly - * EXPORTS_END - */ |