aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--background/main.js60
-rw-r--r--background/policy_injector.js61
-rw-r--r--background/stream_filter.js176
-rw-r--r--content/main.js5
-rw-r--r--content/sanitize_document.js229
-rw-r--r--copyright7
6 files changed, 437 insertions, 101 deletions
diff --git a/background/main.js b/background/main.js
index 7c50fd5..85f8ce8 100644
--- a/background/main.js
+++ b/background/main.js
@@ -11,18 +11,21 @@
* IMPORT get_storage
* IMPORT start_storage_server
* IMPORT start_page_actions_server
- * IMPORT start_policy_injector
* IMPORT browser
+ * IMPORT is_privileged_url
+ * IMPORT query_best
+ * IMPORT gen_nonce
+ * IMPORT inject_csp_headers
+ * IMPORT apply_stream_filter
+ * IMPORT is_chrome
* IMPORTS_END
*/
start_storage_server();
start_page_actions_server();
-start_policy_injector();
async function init_ext(install_details)
{
- console.log("details:", install_details);
if (install_details.reason != "install")
return;
@@ -44,4 +47,53 @@ async function init_ext(install_details)
browser.runtime.onInstalled.addListener(init_ext);
-console.log("hello, hachette");
+
+let storage;
+
+function on_headers_received(details)
+{
+ const url = details.url;
+ if (is_privileged_url(details.url))
+ return;
+
+ const [pattern, settings] = query_best(storage, details.url);
+ const allow = !!(settings && settings.allow);
+ const nonce = gen_nonce();
+ const policy = {allow, url, nonce};
+
+ let headers = details.responseHeaders;
+ let skip = false;
+ for (const header of headers) {
+ if ((header.name.toLowerCase().trim() === "content-disposition" &&
+ /^\s*attachment\s*(;.*)$/i.test(header.value)))
+ skip = true;
+ }
+
+ headers = inject_csp_headers(details, headers, policy);
+
+ skip = skip || (details.statusCode >= 300 && details.statusCode < 400);
+ if (!skip) {
+ /* Check for API availability. */
+ if (browser.webRequest.filterResponseData)
+ headers = apply_stream_filter(details, headers, policy);
+ }
+
+ return {responseHeaders: headers};
+}
+
+async function start_webRequest_operations()
+{
+ storage = await get_storage();
+
+ const extra_opts = ["blocking", "responseHeaders"];
+ if (is_chrome)
+ extra_opts.push("extraHeaders");
+
+ browser.webRequest.onHeadersReceived.addListener(
+ on_headers_received,
+ {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
+ extra_opts
+ );
+}
+
+start_webRequest_operations();
diff --git a/background/policy_injector.js b/background/policy_injector.js
index 3398b53..1d4db6f 100644
--- a/background/policy_injector.js
+++ b/background/policy_injector.js
@@ -8,36 +8,21 @@
/*
* IMPORTS_START
- * IMPORT get_storage
- * IMPORT browser
- * IMPORT is_chrome
- * IMPORT gen_nonce
- * IMPORT is_privileged_url
* IMPORT sign_data
* IMPORT extract_signed
- * IMPORT query_best
* IMPORT sanitize_csp_header
* IMPORT csp_rule
* IMPORT is_csp_header_name
* IMPORTS_END
*/
-var storage;
-
-function headers_inject(details)
+function inject_csp_headers(details, headers, policy)
{
const url = details.url;
- if (is_privileged_url(url))
- return;
-
- const [pattern, settings] = query_best(storage, url);
- const allow = !!(settings && settings.allow);
- const nonce = gen_nonce();
let orig_csp_headers;
let old_signature;
let hachette_header;
- let headers = details.responseHeaders;
for (const header of headers.filter(h => h.name === "x-hachette")) {
const match = /^([^%])(%.*)$/.exec(header.value);
@@ -50,7 +35,7 @@ function headers_inject(details)
/* Confirmed- it's the originals, smuggled in! */
orig_csp_headers = old_data.csp_headers;
- old_signature = old_data.policy_signature;
+ old_signature = old_data.policy_sig;
hachette_header = header;
break;
@@ -65,21 +50,20 @@ function headers_inject(details)
headers.filter(h => is_csp_header_name(h.name));
/* When blocking remove report-only CSP headers that snitch on us. */
- headers = headers.filter(h => !is_csp_header_name(h.name, !allow));
+ headers = headers.filter(h => !is_csp_header_name(h.name, !policy.allow));
if (old_signature)
headers = headers.filter(h => h.name.search(old_signature) === -1);
- const policy_object = {allow, nonce, url};
- const sanitizer = h => sanitize_csp_header(h, policy_object);
+ const sanitizer = h => sanitize_csp_header(h, policy);
headers.push(...orig_csp_headers.map(sanitizer));
- const policy = encodeURIComponent(JSON.stringify(policy_object));
- const policy_signature = sign_data(policy, new Date());
+ const policy_str = encodeURIComponent(JSON.stringify(policy));
+ const policy_sig = sign_data(policy_str, new Date());
const later_30sec = new Date(new Date().getTime() + 30000).toGMTString();
headers.push({
name: "Set-Cookie",
- value: `hachette-${policy_signature}=${policy}; Expires=${later_30sec};`
+ value: `hachette-${policy_sig}=${policy_str}; Expires=${later_30sec};`
});
/*
@@ -87,37 +71,22 @@ function headers_inject(details)
* These are signed with a time of 0, as it's not clear there is a limit on
* how long Firefox might retain headers in the cache.
*/
- let hachette_data = {csp_headers: orig_csp_headers, policy_signature, url};
+ let hachette_data = {csp_headers: orig_csp_headers, policy_sig, url};
hachette_data = encodeURIComponent(JSON.stringify(hachette_data));
hachette_header.value = sign_data(hachette_data, 0) + hachette_data;
/* To ensure there is a CSP header if required */
- if (!allow)
- headers.push({name: "content-security-policy", value: csp_rule(nonce)});
+ if (!policy.allow)
+ headers.push({
+ name: "content-security-policy",
+ value: csp_rule(policy.nonce)
+ });
- return {responseHeaders: headers};
-}
-
-async function start_policy_injector()
-{
- storage = await get_storage();
-
- let extra_opts = ["blocking", "responseHeaders"];
- if (is_chrome)
- extra_opts.push("extraHeaders");
-
- browser.webRequest.onHeadersReceived.addListener(
- headers_inject,
- {
- urls: ["<all_urls>"],
- types: ["main_frame", "sub_frame"]
- },
- extra_opts
- );
+ return headers;
}
/*
* EXPORTS_START
- * EXPORT start_policy_injector
+ * EXPORT inject_csp_headers
* EXPORTS_END
*/
diff --git a/background/stream_filter.js b/background/stream_filter.js
new file mode 100644
index 0000000..2dce811
--- /dev/null
+++ b/background/stream_filter.js
@@ -0,0 +1,176 @@
+/**
+ * Hachette modifying a web page using the StreamFilter API
+ *
+ * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ *
+ * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
+ * in LibreJS.
+ */
+
+/*
+ * IMPORTS_START
+ * IMPORT browser
+ * IMPORTS_END
+ */
+
+function validate_encoding(charset)
+{
+ try {
+ new TextDecoder();
+ return charset;
+ } catch(e) {
+ return undefined;
+ }
+}
+
+function is_content_type_header(header)
+{
+ header.name.toLowerCase().trim() === "content-type";
+}
+
+const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
+
+function properties_from_headers(headers)
+{
+ const properties = {};
+
+ for (const header of headers.filter(is_content_type_header)) {
+ const match = charset_reg.exec(header.value);
+ if (!properties.detected_charset && validate_encoding(match[1]))
+ properties.detected_charset = match[1];
+
+ if (/html/i.test(header.value))
+ properties.html = true;
+ }
+
+ return properties;
+}
+
+const UTF8_BOM = [0xef, 0xbb, 0xbf];
+const BOMs = [
+ [UTF8_BOM, "utf-8"],
+ [[0xfe, 0xff], "utf-16be"],
+ [[0xff, 0xfe], "utf-16le"]
+];
+
+function charset_from_BOM(data)
+{
+ for (const [BOM, charset] of BOMs) {
+ if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
+ return charset;
+ }
+
+ return "";
+}
+
+const charset_attrs =
+ ['charset', 'http-equiv="content-type"', 'content*="charset"'];
+const charset_meta_selector =
+ charset_attrs.map(a => `head>meta[${a}]`).join(", ");
+
+function charset_from_meta_tags(doc)
+{
+ for (const meta of doc.querySelectorAll(charset_meta_selector)) {
+ const maybe_charset = meta.getAttribute("charset");
+ if (maybe_charset && validate_encoding(maybe_charset))
+ return maybe_charset;
+
+ const match = charset_reg.exec(meta.getAttribute("content"));
+ if (match && validate_encoding(match[1]))
+ return match[1];
+ }
+
+ return undefined;
+}
+
+function create_decoder(properties, data)
+{
+ let charset = charset_from_BOM(data) || properties.detected_charset;
+ if (!charset && data.indexOf(0) !== -1) {
+ console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
+ properties);
+ return new TextDecoder("utf-16be");
+ }
+
+ /* Missing HTTP charset, sniffing in content... */
+ /*
+ * TODO: I recall there is some standard saying how early in the doc the
+ * charset has to be specified. We could process just this part of data.
+ */
+ const text = new TextDecoder("latin1").decode(data, {stream: true});
+ properties.html = properties.html || /html/i.test(text);
+
+ if (properties.html) {
+ const tmp_doc = new DOMParser().parseFromString(text, "text/html");
+ charset = charset_from_meta_tags(tmp_doc);
+ }
+
+ return new TextDecoder(charset || "latin1");
+}
+
+function filter_data(properties, event)
+{
+ const data = new Uint8Array(event.data);
+ let first_chunk = false;
+ if (!properties.decoder) {
+ first_chunk = true;
+ properties.decoder = create_decoder(properties, data);
+ properties.encoder = new TextEncoder();
+ /* Force UTF-8, this is the only encoding we can produce. */
+ properties.filter.write(new Uint8Array(UTF8_BOM));
+ }
+
+ let decoded = properties.decoder.decode(data);
+
+ if (first_chunk) {
+ /*
+ * HAX! Our content scripts that execute at `document_start' will always
+ * run before the first script in the document, but under Mozilla some
+ * `<meta>' tags might already be loaded at that point. Here we inject a
+ * dummy `<script>' at the beginning (before any `<meta>' tags) that
+ * will force `document_start' to happen earlier. This way our content
+ * scripts will be able to sanitize `http-equiv' tags with CSP rules
+ * that would otherwise stop our injected scripts from executing.
+ */
+ const dummy_script =
+ `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
+ const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
+ decoded = doctype_decl + dummy_script +
+ decoded.substring(doctype_decl.length);
+ }
+
+ properties.filter.write(properties.encoder.encode(decoded));
+
+ if (properties.decoder.encoding === "utf-8")
+ properties.filter.disconnect();
+}
+
+function apply_stream_filter(details, headers, policy)
+{
+ if (policy.allow)
+ return headers;
+
+ const properties = properties_from_headers(headers);
+ properties.policy = policy;
+
+ properties.filter =
+ browser.webRequest.filterResponseData(details.requestId);
+
+ properties.filter.ondata = event => filter_data(properties, event);
+ properties.filter.onstop = () => properties.filter.close();
+
+ /*
+ * In the future we might consider modifying the headers that specify
+ * encoding. For now we are not yet doing it, though. However, we
+ * prepend the data with UTF-8 BOM which should be enough.
+ */
+ return headers;
+}
+
+/*
+ * EXPORTS_START
+ * EXPORT apply_stream_filter
+ * EXPORTS_END
+ */
diff --git a/content/main.js b/content/main.js
index 441636c..4ae7738 100644
--- a/content/main.js
+++ b/content/main.js
@@ -47,10 +47,7 @@ if (!is_privileged_url(document.URL)) {
handle_page_actions(policy.nonce);
- if (!policy.allow && is_mozilla)
- addEventListener('beforescriptexecute', mozilla_suppress_scripts, true);
-
- if (!policy.allow && is_chrome) {
+ if (!policy.allow) {
const old_html = document.documentElement;
const new_html = document.createElement("html");
old_html.replaceWith(new_html);
diff --git a/content/sanitize_document.js b/content/sanitize_document.js
index 1533526..727bb6c 100644
--- a/content/sanitize_document.js
+++ b/content/sanitize_document.js
@@ -43,76 +43,100 @@ function block_attribute(node, attr)
node.removeAttribute(attr);
}
-function sanitize_script(script, policy)
+function sanitize_script(script, data)
{
- if (policy.allow)
+ if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
+ script.remove();
+ script.hachette_deleted = true;
+ script.hachette_ignore = true;
+ }
+
+ if (data.policy.allow)
return;
block_attribute(script, "type");
script.setAttribute("type", "application/json");
}
-function inject_csp(head, policy)
+function inject_csp(head, data)
{
- if (policy.allow)
+ if (data.policy.allow)
return;
const meta = document.createElement("meta");
meta.setAttribute("http-equiv", "Content-Security-Policy");
- meta.setAttribute("content", csp_rule(policy.nonce));
+ meta.setAttribute("content", csp_rule(data.policy.nonce));
meta.hachette_ignore = true;
head.prepend(meta);
+
+ data.new_added.unshift([meta, head]);
}
-function sanitize_http_equiv_csp_rule(meta, policy)
+function sanitize_http_equiv_csp_rule(meta, data)
{
const http_equiv = meta.getAttribute("http-equiv");
+ const value = meta.content;
- if (!is_csp_header_name(http_equiv, !policy.allow))
+ if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
return;
- if (policy.allow || is_csp_header_name(http_equiv, false)) {
- let value = meta.getAttribute("content");
- block_attribute(meta, "content");
- if (value) {
- value = sanitize_csp_header({value}, policy).value;
- meta.setAttribute("content", value);
- }
- return;
- }
+ block_attribute(meta, "content");
- block_attribute(meta, "http-equiv");
+ if (data.policy.allow || is_csp_header_name(http_equiv, false))
+ meta.content = sanitize_csp_header({value}, data.policy).value;
}
-function sanitize_node(node, policy)
+function sanitize_node(node, data)
{
if (node.tagName === "SCRIPT")
- sanitize_script(node, policy);
+ sanitize_script(node, data);
if (node.tagName === "HEAD")
- inject_csp(node, policy);
+ inject_csp(node, data);
if (node.tagName === "META")
- sanitize_http_equiv_csp_rule(node, policy);
+ sanitize_http_equiv_csp_rule(node, data);
+
+ if (!data.policy.allow)
+ sanitize_attributes(node, data);
+}
- if (!policy.allow)
- sanitize_attributes(node, policy);
+/*
+ * Instead of calling writer directly with multiple small chunks of reconstructed
+ * HTML code, we utilize `setTimeout()' to only have it called once,
+ * asynchronously.
+ */
+function do_write_callback(data)
+{
+ data.writer(data.chunks.join(""));
+ data.chunks = [];
+
+ if (data.finished && data.finisher)
+ data.finisher();
+}
+
+function do_write(chunk, data)
+{
+ data.chunks.push(chunk);
+ clearTimeout(data.write_timeout);
+ data.write_timeout = setTimeout(() => do_write_callback(data), 0);
}
const serializer = new XMLSerializer();
-function start_node(node, data)
+function start_serializing_node(node, data)
{
+ node.hachette_started = true;
+
if (!data.writer)
return;
- node.hachette_started = true;
const clone = node.cloneNode(false);
clone.textContent = data.uniq;
- data.writer(data.uniq_reg.exec(clone.outerHTML)[1]);
+ do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
}
-function finish_node(node, data)
+function finish_serializing_node(node, data)
{
const nodes_to_process = [node];
@@ -127,40 +151,103 @@ function finish_node(node, data)
while (nodes_to_process.length > 0) {
const node = nodes_to_process.pop();
node.remove();
+ node.hachette_ignore = true;
if (!data.writer)
continue;
if (node.hachette_started) {
node.textContent = data.uniq;
- data.writer(data.uniq_reg.exec(node.outerHTML)[2]);
+ do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
+ continue;
+ }
+
+ do_write(node.outerHTML || serializer.serializeToString(node), data);
+ }
+}
+
+function process_initial_nodes(node, data)
+{
+ if (data.processed_initial_nodes)
+ return;
+
+ data.processed_initial_nodes = true;
+
+ start_serializing_node(data.html_root, data);
+
+ const new_added = [];
+ const nodes_to_process = [data.html_root];
+
+ let i = 0;
+ while (nodes_to_process.length > 0) {
+ let current = nodes_to_process.shift();
+
+ if (current.firstChild) {
+ if (current.firstChild === node)
+ break;
+ nodes_to_process.unshift(current.firstChild, current);
+ new_added.push([current.firstChild, current]);
continue;
}
- data.writer(node.outerHTML || serializer.serializeToString(node));
+ while (current && !current.nextSibling)
+ current = nodes_to_process.shift();
+
+ if (!current || current.nextSibling === node)
+ break;
+
+ nodes_to_process.unshift(current.nextSibling);
+ new_added.push([current.nextSibling, nodes_to_process[1]]);
}
+
+ data.new_added.unshift(...new_added);
}
/*
* Important! Due to some weirdness node.parentElement is not alway correct
- * under Chromium. Track node relations manually.
+ * in MutationRecords under Chromium. Track node relations manually.
*/
function handle_added_node(node, true_parent, data)
{
- if (node.hachette_ignore || true_parent.hachette_ignore)
- return;
+ /*
+ * Functions we call here might cause new nodes to be injected or found
+ * that require processing before the one we got in function argument.
+ * We rely on those functions putting the node(s) they create/find at the
+ * very beginning of the `new_added' queue and (for created nodes) setting
+ * their `hachette_ignore' property, based on which their MutationRecord
+ * will not be processed. A function can also mark a node already in the
+ * `new_added' queue as not eligible for processing by setting its
+ * `hachette_deleted' property.
+ */
- if (!true_parent.hachette_started)
- start_node(true_parent, data)
+ process_initial_nodes(node, data);
- sanitize_node(node, data.policy);
+ data.new_added.push([node, true_parent]);
- if (data.node_eater)
- data.node_eater(node, true_parent);
+ while (data.new_added.length > 0) {
+ [node, true_parent] = data.new_added.shift();
- finish_node(true_parent.hachette_last_added, data);
+ if (true_parent.hachette_deleted)
+ node.hachette_deleted = true;
+ if (node.hachette_deleted)
+ continue;
+
+ if (!true_parent.hachette_started)
+ start_serializing_node(true_parent, data)
+
+ if (!node.hachette_ignore)
+ sanitize_node(node, data);
+
+ if (node.hachette_deleted)
+ continue;
+
+ if (data.node_eater)
+ data.node_eater(node, true_parent);
- true_parent.hachette_last_added = node;
+ finish_serializing_node(true_parent.hachette_last_added, data);
+
+ true_parent.hachette_last_added = node;
+ }
}
function handle_mutation(mutations, data)
@@ -170,28 +257,76 @@ function handle_mutation(mutations, data)
* node.parentElement. The former is the correct one.
*/
for (const mutation of mutations) {
- for (const node of mutation.addedNodes)
+ for (const node of mutation.addedNodes) {
+ /* Check for nodes added by ourselves. */
+ if (mutation.target.hachette_ignore)
+ node.hachette_ignore = true;
+ if (node.hachette_ignore)
+ continue;
+
handle_added_node(node, mutation.target, data);
+ }
}
}
function finish_processing(data)
{
+ process_initial_nodes(undefined, data);
+
+ /*
+ * The `finisher' callback should be called, if provided. Normally our
+ * function that performs the last write does it after seeing `finished'
+ * set to `true'. If, however, there's no `writer' callback and hence no
+ * writes to perform, we need to take care of calling `finisher' here.
+ */
+ data.finished = true;
handle_mutation(data.observer.takeRecords(), data);
- finish_node(data.html_element, data);
data.observer.disconnect();
+
+ /*
+ * Additional whitespace that was after `</body>' gets appended to body.
+ * Although it's a minor issue, it is not what we want. There's no way to
+ * tell exactly what part of that whitespace was after `</body>' and what
+ * was before, so we just replace it with a single newline which looks good
+ * when printed.
+ */
+ const body = data.html_root.lastChild;
+ const text = body && body.tagName === "BODY" && body.lastChild;
+ if (text && text.nodeName === "#text") {
+ const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
+ text.textContent = new_content + "\n";
+ }
+
+ finish_serializing_node(data.html_root, data);
+ if (!data.writer && data.finisher)
+ setTimeout(data.finisher, 0);
}
-function modify_on_the_fly(html_element, policy, consumers)
+/*
+ * This function sanitizes `html_root' according to `policy'. It is capable of
+ * working on an HTML document that is being written to, sanitizing new nodes
+ * as they appear.
+ *
+ * `consumers' object may contain 3 optional callback functions: `writer',
+ * `node_eater' and `finisher'. The first one, if present, is called with chunks
+ * of reconstructed HTML code. The second one, if present, gets called for every
+ * added node with 2 arguments: that node and its parent. The third one is
+ * called at the end, after all processing has been done.
+ *
+ * `modify_on_the_fly()' returns a callback that should be called (with no
+ * arguments) once the document of html_root has finished being written to.
+ * Unfortunately, due to the specific behavior of document that has had its
+ * documentElement replaced
+ */
+function modify_on_the_fly(html_root, policy, consumers)
{
const uniq = gen_nonce();
- const uniq_reg = new RegExp(`^(.*)${uniq}(.*)$`);
- const data = {policy, html_element, uniq, uniq_reg, ...consumers};
-
- start_node(data.html_element, data);
+ const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
+ const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
+ Object.assign(data, consumers);
var observer = new MutationObserver(m => handle_mutation(m, data));
- observer.observe(data.html_element, {
+ observer.observe(data.html_root, {
attributes: true,
childList: true,
subtree: true
diff --git a/copyright b/copyright
index 05a16aa..40126fe 100644
--- a/copyright
+++ b/copyright
@@ -20,6 +20,13 @@ Copyright: 2021 Wojtek Kosior <koszko@koszko.org>
2021 jahoti <jahoti@tilde.team>
License: GPL-3+-javascript or Alicense-1.0
+Files: background/stream_filter.js
+Copyright: 2018 Giorgio Maone <giorgio@maone.net>
+ 2021 Wojtek Kosior <koszko@koszko.org>
+License: GPL-3+-javascript or Alicense-1.0, and GPL-3+
+Comment: Code by Wojtek is dual-licensed under GPL-3+-javascript and
+ Alicense-1.0. Giorgio's code is under GPL-3+.
+
Files: *.html README.txt copyright
Copyright: 2021 Wojtek Kosior <koszko@koszko.org>
License: GPL-3+ or Alicense-1.0 or CC-BY-SA-4.0