summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--background/main.js6
-rw-r--r--background/stream_filter.js47
-rw-r--r--common/misc.js2
-rw-r--r--content/main.js178
-rw-r--r--content/page_actions.js8
-rw-r--r--content/sanitize_document.js344
6 files changed, 200 insertions, 385 deletions
diff --git a/background/main.js b/background/main.js
index b1c252a..03cd5d7 100644
--- a/background/main.js
+++ b/background/main.js
@@ -60,9 +60,11 @@ function on_headers_received(details)
return;
const [pattern, settings] = query_best(storage, details.url);
- const allow = !!(settings ? settings.allow : policy_observable.value);
+ const has_payload = !!(settings && settings.components);
+ const allow = !has_payload &&
+ !!(settings ? settings.allow : policy_observable.value);
const nonce = gen_nonce();
- const policy = {allow, url, nonce};
+ const policy = {allow, url, nonce, has_payload};
let headers = details.responseHeaders;
let skip = false;
diff --git a/background/stream_filter.js b/background/stream_filter.js
index 2dce811..96b6132 100644
--- a/background/stream_filter.js
+++ b/background/stream_filter.js
@@ -12,6 +12,7 @@
/*
* IMPORTS_START
* IMPORT browser
+ * IMPORT is_csp_header_name
* IMPORTS_END
*/
@@ -110,6 +111,35 @@ function create_decoder(properties, data)
return new TextDecoder(charset || "latin1");
}
+function may_define_csp_rules(html)
+{
+ const doc = new DOMParser().parseFromString(html, "text/html");
+
+ for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
+ if (is_csp_header_name(meta.getAttribute("http-equiv"), true) &&
+ meta.content)
+ return true;
+ }
+
+ /*
+ * Even if no naughty `<meta>' tags were found, subsequent chunk of HTML
+ * data could add some. Before we return `false' we need to be sure we
+ * reached the start of `<body>' where `<meta>' tags are no longer valid.
+ */
+
+ if (doc.documentElement.nextSibling || doc.body.nextSibling ||
+ doc.body.childNodes.length > 1)
+ return false;
+
+ if (!doc.body.firstChild)
+ return true;
+
+ if (doc.body.firstChild.nodeName !== "#text")
+ return false;
+
+ return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
+}
+
function filter_data(properties, event)
{
const data = new Uint8Array(event.data);
@@ -118,13 +148,15 @@ function filter_data(properties, event)
first_chunk = true;
properties.decoder = create_decoder(properties, data);
properties.encoder = new TextEncoder();
- /* Force UTF-8, this is the only encoding we can produce. */
- properties.filter.write(new Uint8Array(UTF8_BOM));
}
let decoded = properties.decoder.decode(data);
- if (first_chunk) {
+ /* Force UTF-8, this is the only encoding we can produce. */
+ if (first_chunk)
+ properties.filter.write(new Uint8Array(UTF8_BOM));
+
+ if (first_chunk && may_define_csp_rules(decoded)) {
/*
* HAX! Our content scripts that execute at `document_start' will always
* run before the first script in the document, but under Mozilla some
@@ -133,7 +165,14 @@ function filter_data(properties, event)
* will force `document_start' to happen earlier. This way our content
* scripts will be able to sanitize `http-equiv' tags with CSP rules
* that would otherwise stop our injected scripts from executing.
+ *
+ * As we want to only process HTML files that happen to have naughty
+ * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
+ * `may_define_csp_rules()'. We don't do any additional MIME sniffing as it
+ * is too unreliable (and our heuristic will likely mark non-HTML files
+ * as harmless anyway).
*/
+
const dummy_script =
`<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
@@ -149,7 +188,7 @@ function filter_data(properties, event)
function apply_stream_filter(details, headers, policy)
{
- if (policy.allow)
+ if (!policy.has_payload)
return headers;
const properties = properties_from_headers(headers);
diff --git a/common/misc.js b/common/misc.js
index fd70f62..91d60d2 100644
--- a/common/misc.js
+++ b/common/misc.js
@@ -146,7 +146,7 @@ function sanitize_csp_header(header, policy)
return {name: header.name, value: new_csp.join('')};
}
-/* Regexes and objest to use as/in schemas for parse_json_with_schema(). */
+/* Regexes and objects to use as/in schemas for parse_json_with_schema(). */
const nonempty_string_matcher = /.+/;
const matchers = {
diff --git a/content/main.js b/content/main.js
index da215b9..4fe6d43 100644
--- a/content/main.js
+++ b/content/main.js
@@ -17,21 +17,12 @@
* IMPORT is_chrome
* IMPORT is_mozilla
* IMPORT start_activity_info_server
- * IMPORT modify_on_the_fly
+ * IMPORT csp_rule
+ * IMPORT is_csp_header_name
+ * IMPORT sanitize_csp_header
* IMPORTS_END
*/
-function accept_node(node, parent)
-{
- const clone = document.importNode(node, false);
- node.hachette_corresponding = clone;
- /*
- * TODO: Stop page's own issues like "Error parsing a meta element's
- * content:" from appearing as extension's errors.
- */
- parent.hachette_corresponding.appendChild(clone);
-}
-
function extract_cookie_policy(cookie, min_time)
{
let best_result = {time: -1};
@@ -95,6 +86,143 @@ function employ_nonhttp_policy(policy)
location.reload();
}
+/*
+ * 1. When injecting some payload we need to sanitize <meta> CSP tags before
+ * they reach the document.
+ * 2. Only <meta> tags inside <head> are considered valid by the browser and
+ * need to be considered.
+ * 3. We want to detach <html> from document, wait until its <head> completes
+ * loading, sanitize it and re-attach <html>.
+ * 4. Browsers are eager to add <meta>'s that appear after `</head>' but before
+ * `<body>'. Due to this behavior the `DOMContentLoaded' event is considered
+ * unreliable (although it could still work properly, it is just problematic
+ * to verify).
+ * 5. We shall wait for anything to appear in or after <body> and take that as
+ * a sign <head> has _really_ finished loading.
+ */
+
+function make_body_start_observer(DOM_element, waiting)
+{
+ const observer = new MutationObserver(() => try_body_started(waiting));
+ observer.observe(DOM_element, {childList: true});
+ return observer;
+}
+
+function try_body_started(waiting)
+{
+ const body = waiting.detached_html.querySelector("body");
+
+ if ((body && (body.firstChild || body.nextSibling)) ||
+ waiting.doc.documentElement.nextSibling) {
+ finish_waiting(waiting);
+ return true;
+ }
+
+ if (body && waiting.observers.length < 2)
+ waiting.observers.push(make_body_start_observer(body, waiting));
+}
+
+function finish_waiting(waiting)
+{
+ waiting.observers.forEach(observer => observer.disconnect());
+ waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb);
+ setTimeout(waiting.callback, 0);
+}
+
+function _wait_for_head(doc, detached_html, callback)
+{
+ const waiting = {doc, detached_html, callback, observers: []};
+ if (try_body_started(waiting))
+ return;
+
+ waiting.observers = [make_body_start_observer(detached_html, waiting)];
+ waiting.loaded_cb = () => finish_waiting(waiting);
+ doc.addEventListener("DOMContentLoaded", waiting.loaded_cb);
+}
+
+function wait_for_head(doc, detached_html)
+{
+ return new Promise(cb => _wait_for_head(doc, detached_html, cb));
+}
+
+const blocked_str = "blocked";
+
+function block_attribute(node, attr)
+{
+ /*
+ * Disabling attributes this way allows them to still be relatively
+ * easily accessed in case they contain some useful data.
+ */
+ const construct_name = [attr];
+ while (node.hasAttribute(construct_name.join("")))
+ construct_name.unshift(blocked_str);
+
+ while (construct_name.length > 1) {
+ construct_name.shift();
+ const name = construct_name.join("");
+ node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name));
+ }
+
+ node.removeAttribute(attr);
+}
+
+function sanitize_meta(meta, policy)
+{
+ const http_equiv = meta.getAttribute("http-equiv");
+ const value = meta.content;
+
+ if (!value || !is_csp_header_name(http_equiv, true))
+ return;
+
+ block_attribute(meta, "content");
+
+ if (is_csp_header_name(http_equiv, false))
+ meta.content = sanitize_csp_header({value}, policy).value;
+}
+
+function apply_hachette_csp_rules(doc, policy)
+{
+ const meta = doc.createElement("meta");
+ meta.setAttribute("http-equiv", "Content-Security-Policy");
+ meta.setAttribute("content", csp_rule(policy.nonce));
+ doc.head.append(meta);
+ /* CSP is already in effect, we can remove the <meta> now. */
+ meta.remove();
+}
+
+async function sanitize_document(doc, policy)
+{
+ /*
+ * Ensure our CSP rules are employed from the beginning. This CSP injection
+ * method is, when possible, going to be applied together with CSP rules
+ * injected using webRequest.
+ */
+ const has_own_head = doc.head;
+ if (!has_own_head)
+ doc.documentElement.prepend(doc.createElement("head"));
+
+ apply_hachette_csp_rules(doc, policy);
+
+ /* Probably not needed, but...: proceed with DOM in its initial state. */
+ if (!has_own_head)
+ doc.head.remove();
+
+ /*
+ * <html> node gets hijacked now, to be re-attached after <head> is loaded
+ * and sanitized.
+ */
+ const old_html = doc.documentElement;
+ const new_html = doc.createElement("html");
+ old_html.replaceWith(new_html);
+
+ await wait_for_head(doc, old_html);
+
+ for (const meta of old_html.querySelectorAll("head meta"))
+ sanitize_meta(meta, policy);
+
+ new_html.replaceWith(old_html);
+}
+
if (!is_privileged_url(document.URL)) {
let policy_received_callback = () => undefined;
let policy;
@@ -127,25 +255,13 @@ if (!is_privileged_url(document.URL)) {
policy = {allow: false, nonce: gen_nonce()};
}
- handle_page_actions(policy.nonce, policy_received_callback);
-
- if (!policy.allow) {
- if (is_mozilla) {
- const script = document.querySelector("script");
- if (script)
- script.textContent = "throw 'blocked';\n" + script.textContent;
- }
- const old_html = document.documentElement;
- const new_html = document.createElement("html");
- old_html.replaceWith(new_html);
- old_html.hachette_corresponding = new_html;
-
- const modify_end =
- modify_on_the_fly(old_html, policy, {node_eater: accept_node});
- document.addEventListener("DOMContentLoaded", modify_end);
- }
+ const doc_ready = Promise.all([
+ policy.allow ? Promise.resolve : sanitize_document(document, policy),
+ new Promise(cb => document.addEventListener("DOMContentLoaded",
+ cb, {once: true}))
+ ]);
+
+ handle_page_actions(policy.nonce, policy_received_callback, doc_ready);
start_activity_info_server();
}
-
-console.log("content script");
diff --git a/content/page_actions.js b/content/page_actions.js
index 3799afd..8057541 100644
--- a/content/page_actions.js
+++ b/content/page_actions.js
@@ -42,7 +42,7 @@ function handle_message(message)
}
}
-function document_loaded(event)
+function document_ready(event)
{
loaded = true;
@@ -66,13 +66,15 @@ function add_script(script_text)
report_script(script_text);
}
-function handle_page_actions(script_nonce, policy_received_cb) {
+function handle_page_actions(script_nonce, policy_received_cb,
+ doc_ready_promise) {
policy_received_callback = policy_received_cb;
url = document.URL;
is_html = /html/.test(document.contentType);
report_content_type(document.contentType);
- document.addEventListener("DOMContentLoaded", document_loaded);
+ doc_ready_promise.then(document_ready);
+
port = browser.runtime.connect({name : CONNECTION_TYPE.PAGE_ACTIONS});
port.onMessage.addListener(handle_message);
port.postMessage({url});
diff --git a/content/sanitize_document.js b/content/sanitize_document.js
deleted file mode 100644
index 727bb6c..0000000
--- a/content/sanitize_document.js
+++ /dev/null
@@ -1,344 +0,0 @@
-/**
- * Hachette modify HTML document as it loads and reconstruct HTML code from it
- *
- * Copyright (C) 2021 Wojtek Kosior
- * Redistribution terms are gathered in the `copyright' file.
- */
-
-/*
- * IMPORTS_START
- * IMPORT gen_nonce
- * IMPORT csp_rule
- * IMPORT is_csp_header_name
- * IMPORT sanitize_csp_header
- * IMPORT sanitize_attributes
- * IMPORTS_END
- */
-
-/*
- * Functions that sanitize elements. The script blocking measures are, when
- * possible, going to be applied together with CSP rules injected using
- * webRequest.
- */
-
-const blocked = "blocked";
-
-function block_attribute(node, attr)
-{
- /*
- * Disabling attributed this way allows them to still be relatively
- * easily accessed in case they contain some useful data.
- */
-
- const construct_name = [attr];
- while (node.hasAttribute(construct_name.join("")))
- construct_name.unshift(blocked);
-
- while (construct_name.length > 1) {
- construct_name.shift();
- const name = construct_name.join("");
- node.setAttribute(`${blocked}-${name}`, node.getAttribute(name));
- }
-
- node.removeAttribute(attr);
-}
-
-function sanitize_script(script, data)
-{
- if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
- script.remove();
- script.hachette_deleted = true;
- script.hachette_ignore = true;
- }
-
- if (data.policy.allow)
- return;
-
- block_attribute(script, "type");
- script.setAttribute("type", "application/json");
-}
-
-function inject_csp(head, data)
-{
- if (data.policy.allow)
- return;
-
- const meta = document.createElement("meta");
- meta.setAttribute("http-equiv", "Content-Security-Policy");
- meta.setAttribute("content", csp_rule(data.policy.nonce));
- meta.hachette_ignore = true;
- head.prepend(meta);
-
- data.new_added.unshift([meta, head]);
-}
-
-function sanitize_http_equiv_csp_rule(meta, data)
-{
- const http_equiv = meta.getAttribute("http-equiv");
- const value = meta.content;
-
- if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
- return;
-
- block_attribute(meta, "content");
-
- if (data.policy.allow || is_csp_header_name(http_equiv, false))
- meta.content = sanitize_csp_header({value}, data.policy).value;
-}
-
-function sanitize_node(node, data)
-{
- if (node.tagName === "SCRIPT")
- sanitize_script(node, data);
-
- if (node.tagName === "HEAD")
- inject_csp(node, data);
-
- if (node.tagName === "META")
- sanitize_http_equiv_csp_rule(node, data);
-
- if (!data.policy.allow)
- sanitize_attributes(node, data);
-}
-
-/*
- * Instead of calling writer directly with multiple small chunks of reconstruced
- * HTML code, we utilize `setTimeout()' to only have it called once,
- * asynchronously.
- */
-function do_write_callback(data)
-{
- data.writer(data.chunks.join(""));
- data.chunks = [];
-
- if (data.finished && data.finisher)
- data.finisher();
-}
-
-function do_write(chunk, data)
-{
- data.chunks.push(chunk);
- clearTimeout(data.write_timeout);
- data.write_timeout = setTimeout(() => do_write_callback(data), 0);
-}
-
-const serializer = new XMLSerializer();
-
-function start_serializing_node(node, data)
-{
- node.hachette_started = true;
-
- if (!data.writer)
- return;
-
- const clone = node.cloneNode(false);
- clone.textContent = data.uniq;
- do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
-}
-
-function finish_serializing_node(node, data)
-{
- const nodes_to_process = [node];
-
- while (true) {
- node = nodes_to_process.pop();
- if (!node)
- break;
-
- nodes_to_process.push(node, node.hachette_last_added);
- }
-
- while (nodes_to_process.length > 0) {
- const node = nodes_to_process.pop();
- node.remove();
- node.hachette_ignore = true;
-
- if (!data.writer)
- continue;
-
- if (node.hachette_started) {
- node.textContent = data.uniq;
- do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
- continue;
- }
-
- do_write(node.outerHTML || serializer.serializeToString(node), data);
- }
-}
-
-function process_initial_nodes(node, data)
-{
- if (data.processed_initial_nodes)
- return;
-
- data.processed_initial_nodes = true;
-
- start_serializing_node(data.html_root, data);
-
- const new_added = [];
- const nodes_to_process = [data.html_root];
-
- let i = 0;
- while (nodes_to_process.length > 0) {
- let current = nodes_to_process.shift();
-
- if (current.firstChild) {
- if (current.firstChild === node)
- break;
- nodes_to_process.unshift(current.firstChild, current);
- new_added.push([current.firstChild, current]);
- continue;
- }
-
- while (current && !current.nextSibling)
- current = nodes_to_process.shift();
-
- if (!current || current.nextSibling === node)
- break;
-
- nodes_to_process.unshift(current.nextSibling);
- new_added.push([current.nextSibling, nodes_to_process[1]]);
- }
-
- data.new_added.unshift(...new_added);
-}
-
-/*
- * Important! Due to some weirdness node.parentElement is not alway correct
- * in MutationRecords under Chromium. Track node relations manually.
- */
-function handle_added_node(node, true_parent, data)
-{
- /*
- * Functions we call here might cause new nodes to be injected or found
- * that require processing before the one we got in function argument.
- * We rely on those functions putting the node(s) they create/find at the
- * very beginning of the `new_added' queue and (for created nodes) setting
- * their `hachette_ignore' property, based on which their MutationRecord
- * will not be processed. A function can also mark a node already in the
- * `new_added' queue as not eligible for processing by setting its
- * `hachette_deleted' property.
- */
-
- process_initial_nodes(node, data);
-
- data.new_added.push([node, true_parent]);
-
- while (data.new_added.length > 0) {
- [node, true_parent] = data.new_added.shift();
-
- if (true_parent.hachette_deleted)
- node.hachette_deleted = true;
- if (node.hachette_deleted)
- continue;
-
- if (!true_parent.hachette_started)
- start_serializing_node(true_parent, data)
-
- if (!node.hachette_ignore)
- sanitize_node(node, data);
-
- if (node.hachette_deleted)
- continue;
-
- if (data.node_eater)
- data.node_eater(node, true_parent);
-
- finish_serializing_node(true_parent.hachette_last_added, data);
-
- true_parent.hachette_last_added = node;
- }
-}
-
-function handle_mutation(mutations, data)
-{
- /*
- * Chromium: for an unknown reason mutation.target is not always the same as
- * node.parentElement. The former is the correct one.
- */
- for (const mutation of mutations) {
- for (const node of mutation.addedNodes) {
- /* Check for nodes added by ourselves. */
- if (mutation.target.hachette_ignore)
- node.hachette_ignore = true;
- if (node.hachette_ignore)
- continue;
-
- handle_added_node(node, mutation.target, data);
- }
- }
-}
-
-function finish_processing(data)
-{
- process_initial_nodes(undefined, data);
-
- /*
- * The `finisher' callback should be called, if provided. Normally our
- * function that performs the last write does it after seeing `finished'
- * set to `true'. If, however, there's no `writer' callback and hence no
- * writes to perform, we need to take care of calling `finisher' here.
- */
- data.finished = true;
- handle_mutation(data.observer.takeRecords(), data);
- data.observer.disconnect();
-
- /*
- * Additional whitespace that was after `</body>' gets appended to body.
- * Although it's a minor issue, it is not what we want. There's no way to
- * tell exactly what part of that whitespace was after `</body>' and what
- * was before, so we just replace it with a single newline which looks good
- * when printed.
- */
- const body = data.html_root.lastChild;
- const text = body && body.tagName === "BODY" && body.lastChild;
- if (text && text.nodeName === "#text") {
- const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
- text.textContent = new_content + "\n";
- }
-
- finish_serializing_node(data.html_root, data);
- if (!data.writer && data.finisher)
- setTimeout(data.finisher, 0);
-}
-
-/*
- * This function sanitizes `html_root' according to `policy'. It is capable of
- * working on an HTML document that is being written to, sanitizing new nodes
- * as they appear.
- *
- * `consumers' object may contain 3 optional callback functions: `writer',
- * `node_eater' and `finisher'. The first one, if present, is called with chunks
- * of reconstructed HTML code. The second one, if present, gets called for every
- * added node with 2 arguments: that node and its parent. The third one is
- * called at the end, after all processing has been done.
- *
- * `modify_on_the_fly()' returns a callback that should be called (with no
- * arguments) once the document of html_root has finished being written to.
- * Unfortunately, due to specifics behavior of document that has had its
- * documentElement replaced
- */
-function modify_on_the_fly(html_root, policy, consumers)
-{
- const uniq = gen_nonce();
- const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
- const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
- Object.assign(data, consumers);
-
- var observer = new MutationObserver(m => handle_mutation(m, data));
- observer.observe(data.html_root, {
- attributes: true,
- childList: true,
- subtree: true
- });
-
- data.observer = observer;
-
- return () => finish_processing(data);
-}
-
-/*
- * EXPORTS_START
- * EXPORT modify_on_the_fly
- * EXPORTS_END
- */