-rw-r--r--  background/main.js            |  60
-rw-r--r--  background/policy_injector.js |  61
-rw-r--r--  background/stream_filter.js   | 176
-rw-r--r--  content/main.js               |   5
-rw-r--r--  content/sanitize_document.js  | 229
-rw-r--r--  copyright                     |   7
6 files changed, 437 insertions(+), 101 deletions(-)
diff --git a/background/main.js b/background/main.js
index 7c50fd5..85f8ce8 100644
--- a/background/main.js
+++ b/background/main.js
@@ -11,18 +11,21 @@
  * IMPORT get_storage
  * IMPORT start_storage_server
  * IMPORT start_page_actions_server
- * IMPORT start_policy_injector
  * IMPORT browser
+ * IMPORT is_privileged_url
+ * IMPORT query_best
+ * IMPORT gen_nonce
+ * IMPORT inject_csp_headers
+ * IMPORT apply_stream_filter
+ * IMPORT is_chrome
  * IMPORTS_END
  */
 
 start_storage_server();
 start_page_actions_server();
-start_policy_injector();
 
 async function init_ext(install_details)
 {
-    console.log("details:", install_details);
     if (install_details.reason != "install")
         return;
@@ -44,4 +47,53 @@ async function init_ext(install_details)
 
 browser.runtime.onInstalled.addListener(init_ext);
 
-console.log("hello, hachette");
+
+let storage;
+
+function on_headers_received(details)
+{
+    const url = details.url;
+    if (is_privileged_url(details.url))
+        return;
+
+    const [pattern, settings] = query_best(storage, details.url);
+    const allow = !!(settings && settings.allow);
+    const nonce = gen_nonce();
+    const policy = {allow, url, nonce};
+
+    let headers = details.responseHeaders;
+    let skip = false;
+    for (const header of headers) {
+        if ((header.name.toLowerCase().trim() === "content-disposition" &&
+             /^\s*attachment\s*(;.*)?$/i.test(header.value)))
+            skip = true;
+    }
+
+    headers = inject_csp_headers(details, headers, policy);
+
+    skip = skip || (details.statusCode >= 300 && details.statusCode < 400);
+    if (!skip) {
+        /* Check for StreamFilter API availability (Firefox-only). */
+        if (browser.webRequest.filterResponseData)
+            headers = apply_stream_filter(details, headers, policy);
+    }
+
+    return {responseHeaders: headers};
+}
+
+async function start_webRequest_operations()
+{
+    storage = await get_storage();
+
+    const extra_opts = ["blocking", "responseHeaders"];
+    if (is_chrome)
+        extra_opts.push("extraHeaders");
+
+    browser.webRequest.onHeadersReceived.addListener(
+        on_headers_received,
+        {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
+        extra_opts
+    );
+}
+
+start_webRequest_operations();
diff --git a/background/policy_injector.js b/background/policy_injector.js
index 3398b53..1d4db6f 100644
--- a/background/policy_injector.js
+++ b/background/policy_injector.js
@@ -8,36 +8,21 @@
 
 /*
  * IMPORTS_START
- * IMPORT get_storage
- * IMPORT browser
- * IMPORT is_chrome
- * IMPORT gen_nonce
- * IMPORT is_privileged_url
  * IMPORT sign_data
  * IMPORT extract_signed
- * IMPORT query_best
  * IMPORT sanitize_csp_header
  * IMPORT csp_rule
  * IMPORT is_csp_header_name
  * IMPORTS_END
  */
 
-var storage;
-
-function headers_inject(details)
+function inject_csp_headers(details, headers, policy)
 {
     const url = details.url;
-    if (is_privileged_url(url))
-        return;
-
-    const [pattern, settings] = query_best(storage, url);
-    const allow = !!(settings && settings.allow);
-    const nonce = gen_nonce();
 
     let orig_csp_headers;
     let old_signature;
     let hachette_header;
-    let headers = details.responseHeaders;
 
     for (const header of headers.filter(h => h.name === "x-hachette")) {
        const match = /^([^%])(%.*)$/.exec(header.value);
@@ -50,7 +35,7 @@ function headers_inject(details)
        /* Confirmed - it's the originals, smuggled in! */
        orig_csp_headers = old_data.csp_headers;
-       old_signature = old_data.policy_signature;
+       old_signature = old_data.policy_sig;
 
        hachette_header = header;
        break;
@@ -65,21 +50,20 @@
        headers.filter(h => is_csp_header_name(h.name));
 
    /* When blocking, remove report-only CSP headers that snitch on us. */
-    headers = headers.filter(h => !is_csp_header_name(h.name, !allow));
+    headers = headers.filter(h => !is_csp_header_name(h.name, !policy.allow));
 
    if (old_signature)
        headers = headers.filter(h => h.name.search(old_signature) === -1);
 
-    const policy_object = {allow, nonce, url};
-    const sanitizer = h => sanitize_csp_header(h, policy_object);
+    const sanitizer = h => sanitize_csp_header(h, policy);
    headers.push(...orig_csp_headers.map(sanitizer));
 
-    const policy = encodeURIComponent(JSON.stringify(policy_object));
-    const policy_signature = sign_data(policy, new Date());
+    const policy_str = encodeURIComponent(JSON.stringify(policy));
+    const policy_sig = sign_data(policy_str, new Date());
    const later_30sec = new Date(new Date().getTime() + 30000).toGMTString();
    headers.push({
        name: "Set-Cookie",
-       value: `hachette-${policy_signature}=${policy}; Expires=${later_30sec};`
+       value: `hachette-${policy_sig}=${policy_str}; Expires=${later_30sec};`
    });
 
    /*
@@ -87,37 +71,22 @@ function headers_inject(details)
     * These are signed with a time of 0, as it's not clear there is a limit on
     * how long Firefox might retain headers in the cache.
     */
-    let hachette_data = {csp_headers: orig_csp_headers, policy_signature, url};
+    let hachette_data = {csp_headers: orig_csp_headers, policy_sig, url};
    hachette_data = encodeURIComponent(JSON.stringify(hachette_data));
    hachette_header.value = sign_data(hachette_data, 0) + hachette_data;
 
    /* To ensure there is a CSP header if required */
-    if (!allow)
-        headers.push({name: "content-security-policy", value: csp_rule(nonce)});
+    if (!policy.allow)
+        headers.push({
+            name: "content-security-policy",
+            value: csp_rule(policy.nonce)
+        });
 
-    return {responseHeaders: headers};
-}
-
-async function start_policy_injector()
-{
-    storage = await get_storage();
-
-    let extra_opts = ["blocking", "responseHeaders"];
-    if (is_chrome)
-        extra_opts.push("extraHeaders");
-
-    browser.webRequest.onHeadersReceived.addListener(
-        headers_inject,
-        {
-            urls: ["<all_urls>"],
-            types: ["main_frame", "sub_frame"]
-        },
-        extra_opts
-    );
+    return headers;
 }
 
 /*
  * EXPORTS_START
- * EXPORT start_policy_injector
+ * EXPORT inject_csp_headers
  * EXPORTS_END
  */
diff --git a/background/stream_filter.js b/background/stream_filter.js
new file mode 100644
index 0000000..2dce811
--- /dev/null
+++ b/background/stream_filter.js
@@ -0,0 +1,176 @@
+/**
+ * Hachette modifying a web page using the StreamFilter API
+ *
+ * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ *
+ * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
+ * in LibreJS.
+ */
+
+/*
+ * IMPORTS_START
+ * IMPORT browser
+ * IMPORTS_END
+ */
+
+function validate_encoding(charset)
+{
+    try {
+        new TextDecoder(charset);
+        return charset;
+    } catch(e) {
+        return undefined;
+    }
+}
+
+function is_content_type_header(header)
+{
+    return header.name.toLowerCase().trim() === "content-type";
+}
+
+const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
+
+function properties_from_headers(headers)
+{
+    const properties = {};
+
+    for (const header of headers.filter(is_content_type_header)) {
+        const match = charset_reg.exec(header.value);
+        if (match && !properties.detected_charset &&
+            validate_encoding(match[1]))
+            properties.detected_charset = match[1];
+
+        if (/html/i.test(header.value))
+            properties.html = true;
+    }
+
+    return properties;
+}
+
+const UTF8_BOM = [0xef, 0xbb, 0xbf];
+const BOMs = [
+    [UTF8_BOM, "utf-8"],
+    [[0xfe, 0xff], "utf-16be"],
+    [[0xff, 0xfe], "utf-16le"]
+];
+
+function charset_from_BOM(data)
+{
+    for (const [BOM, charset] of BOMs) {
+        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
+            return charset;
+    }
+
+    return "";
+}
+
+const charset_attrs =
+    ['charset', 'http-equiv="content-type"', 'content*="charset"'];
+const charset_meta_selector =
+    charset_attrs.map(a => `head>meta[${a}]`).join(", ");
+
+function charset_from_meta_tags(doc)
+{
+    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
+        const maybe_charset = meta.getAttribute("charset");
+        if (maybe_charset && validate_encoding(maybe_charset))
+            return maybe_charset;
+
+        const match = charset_reg.exec(meta.getAttribute("content"));
+        if (match && validate_encoding(match[1]))
+            return match[1];
+    }
+
+    return undefined;
+}
+
+function create_decoder(properties, data)
+{
+    let charset = charset_from_BOM(data) || properties.detected_charset;
+    if (!charset && data.indexOf(0) !== -1) {
+        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
+                      properties);
+        return new TextDecoder("utf-16be");
+    }
+
+    /* Missing HTTP charset, sniffing in content... */
+    /*
+     * TODO: I recall there is some standard saying how early in the doc the
+     * charset has to be specified. We could process just this part of data.
+     */
+    const text = new TextDecoder("latin1").decode(data, {stream: true});
+    properties.html = properties.html || /html/i.test(text);
+
+    if (properties.html) {
+        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
+        charset = charset_from_meta_tags(tmp_doc);
+    }
+
+    return new TextDecoder(charset || "latin1");
+}
+
+function filter_data(properties, event)
+{
+    const data = new Uint8Array(event.data);
+    let first_chunk = false;
+    if (!properties.decoder) {
+        first_chunk = true;
+        properties.decoder = create_decoder(properties, data);
+        properties.encoder = new TextEncoder();
+        /* Force UTF-8; this is the only encoding we can produce. */
+        properties.filter.write(new Uint8Array(UTF8_BOM));
+    }
+
+    let decoded = properties.decoder.decode(data, {stream: true});
+
+    if (first_chunk) {
+        /*
+         * HAX! Our content scripts that execute at `document_start' will
+         * always run before the first script in the document, but under
+         * Mozilla some `<meta>' tags might already be loaded at that point.
+         * Here we inject a dummy `<script>' at the beginning (before any
+         * `<meta>' tags) that will force `document_start' to happen earlier.
+         * This way our content scripts will be able to sanitize `http-equiv'
+         * tags with CSP rules that would otherwise stop our injected scripts
+         * from executing.
+         */
+        const dummy_script =
+              `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
+        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
+        decoded = doctype_decl + dummy_script +
+            decoded.substring(doctype_decl.length);
+    }
+
+    properties.filter.write(properties.encoder.encode(decoded));
+
+    if (properties.decoder.encoding === "utf-8")
+        properties.filter.disconnect();
+}
+
+function apply_stream_filter(details, headers, policy)
+{
+    if (policy.allow)
+        return headers;
+
+    const properties = properties_from_headers(headers);
+    properties.policy = policy;
+
+    properties.filter =
+        browser.webRequest.filterResponseData(details.requestId);
+
+    properties.filter.ondata = event => filter_data(properties, event);
+    properties.filter.onstop = () => properties.filter.close();
+
+    /*
+     * In the future we might consider modifying the headers that specify
+     * encoding. For now we are not doing it, though; the UTF-8 BOM we
+     * prepend to the data should be enough.
+     */
+    return headers;
+}
+
+/*
+ * EXPORTS_START
+ * EXPORT apply_stream_filter
+ * EXPORTS_END
+ */
diff --git a/content/main.js b/content/main.js
index 441636c..4ae7738 100644
--- a/content/main.js
+++ b/content/main.js
@@ -47,10 +47,7 @@ if (!is_privileged_url(document.URL)) {
 
     handle_page_actions(policy.nonce);
 
-    if (!policy.allow && is_mozilla)
-        addEventListener('beforescriptexecute', mozilla_suppress_scripts, true);
-
-    if (!policy.allow && is_chrome) {
+    if (!policy.allow) {
        const old_html = document.documentElement;
        const new_html = document.createElement("html");
        old_html.replaceWith(new_html);
diff --git a/content/sanitize_document.js b/content/sanitize_document.js
index 1533526..727bb6c 100644
--- a/content/sanitize_document.js
+++ b/content/sanitize_document.js
@@ -43,76 +43,100 @@ function block_attribute(node, attr)
     node.removeAttribute(attr);
 }
 
-function sanitize_script(script, policy)
+function sanitize_script(script, data)
 {
-    if (policy.allow)
+    if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
+        script.remove();
+        script.hachette_deleted = true;
+        script.hachette_ignore = true;
+    }
+
+    if (data.policy.allow)
         return;
 
     block_attribute(script, "type");
     script.setAttribute("type", "application/json");
 }
 
-function inject_csp(head, policy)
+function inject_csp(head, data)
 {
-    if (policy.allow)
+    if (data.policy.allow)
         return;
 
     const meta = document.createElement("meta");
     meta.setAttribute("http-equiv", "Content-Security-Policy");
-    meta.setAttribute("content", csp_rule(policy.nonce));
+    meta.setAttribute("content", csp_rule(data.policy.nonce));
     meta.hachette_ignore = true;
     head.prepend(meta);
+
+    data.new_added.unshift([meta, head]);
 }
 
-function sanitize_http_equiv_csp_rule(meta, policy)
+function sanitize_http_equiv_csp_rule(meta, data)
 {
     const http_equiv = meta.getAttribute("http-equiv");
+    const value = meta.content;
 
-    if (!is_csp_header_name(http_equiv, !policy.allow))
+    if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
         return;
 
-    if (policy.allow || is_csp_header_name(http_equiv, false)) {
-        let value = meta.getAttribute("content");
-        block_attribute(meta, "content");
-        if (value) {
-            value = sanitize_csp_header({value}, policy).value;
-            meta.setAttribute("content", value);
-        }
-        return;
-    }
+    block_attribute(meta, "content");
 
-    block_attribute(meta, "http-equiv");
+    if (data.policy.allow || is_csp_header_name(http_equiv, false))
+        meta.content = sanitize_csp_header({value}, data.policy).value;
 }
 
-function sanitize_node(node, policy)
+function sanitize_node(node, data)
 {
     if (node.tagName === "SCRIPT")
-        sanitize_script(node, policy);
+        sanitize_script(node, data);
 
     if (node.tagName === "HEAD")
-        inject_csp(node, policy);
+        inject_csp(node, data);
 
     if (node.tagName === "META")
-        sanitize_http_equiv_csp_rule(node, policy);
+        sanitize_http_equiv_csp_rule(node, data);
+
+    if (!data.policy.allow)
+        sanitize_attributes(node, data);
+}
 
-    if (!policy.allow)
-        sanitize_attributes(node, policy);
+/*
+ * Instead of calling writer directly with multiple small chunks of
+ * reconstructed HTML code, we utilize `setTimeout()' to only have it called
+ * once, asynchronously.
+ */
+function do_write_callback(data)
+{
+    data.writer(data.chunks.join(""));
+    data.chunks = [];
+
+    if (data.finished && data.finisher)
+        data.finisher();
+}
+
+function do_write(chunk, data)
+{
+    data.chunks.push(chunk);
+    clearTimeout(data.write_timeout);
+    data.write_timeout = setTimeout(() => do_write_callback(data), 0);
 }
 
 const serializer = new XMLSerializer();
 
-function start_node(node, data)
+function start_serializing_node(node, data)
 {
+    node.hachette_started = true;
+
     if (!data.writer)
         return;
 
-    node.hachette_started = true;
     const clone = node.cloneNode(false);
     clone.textContent = data.uniq;
-    data.writer(data.uniq_reg.exec(clone.outerHTML)[1]);
+    do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
 }
 
-function finish_node(node, data)
+function finish_serializing_node(node, data)
 {
     const nodes_to_process = [node];
 
@@ -127,40 +151,103 @@ function finish_node(node, data)
     while (nodes_to_process.length > 0) {
        const node = nodes_to_process.pop();
        node.remove();
+       node.hachette_ignore = true;
 
        if (!data.writer)
            continue;
 
        if (node.hachette_started) {
            node.textContent = data.uniq;
-           data.writer(data.uniq_reg.exec(node.outerHTML)[2]);
+           do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
+           continue;
+       }
+
+       do_write(node.outerHTML || serializer.serializeToString(node), data);
+    }
+}
+
+function process_initial_nodes(node, data)
+{
+    if (data.processed_initial_nodes)
+        return;
+
+    data.processed_initial_nodes = true;
+
+    start_serializing_node(data.html_root, data);
+
+    const new_added = [];
+    const nodes_to_process = [data.html_root];
+
+    while (nodes_to_process.length > 0) {
+        let current = nodes_to_process.shift();
+
+        if (current.firstChild) {
+            if (current.firstChild === node)
+                break;
+            nodes_to_process.unshift(current.firstChild, current);
+            new_added.push([current.firstChild, current]);
            continue;
        }
 
-       data.writer(node.outerHTML || serializer.serializeToString(node));
+        while (current && !current.nextSibling)
+            current = nodes_to_process.shift();
+
+        if (!current || current.nextSibling === node)
+            break;
+
+        nodes_to_process.unshift(current.nextSibling);
+        new_added.push([current.nextSibling, nodes_to_process[1]]);
     }
+
+    data.new_added.unshift(...new_added);
 }
 
 /*
  * Important! Due to some weirdness node.parentElement is not always correct
- * under Chromium. Track node relations manually.
+ * in MutationRecords under Chromium. Track node relations manually.
  */
 function handle_added_node(node, true_parent, data)
 {
-    if (node.hachette_ignore || true_parent.hachette_ignore)
-        return;
+    /*
+     * Functions we call here might cause new nodes to be injected or found
+     * that require processing before the one we got as the function argument.
+     * We rely on those functions putting the node(s) they create/find at the
+     * very beginning of the `new_added' queue and (for created nodes) setting
+     * their `hachette_ignore' property, based on which their MutationRecord
+     * will not be processed. A function can also mark a node already in the
+     * `new_added' queue as not eligible for processing by setting its
+     * `hachette_deleted' property.
+     */
 
-    if (!true_parent.hachette_started)
-        start_node(true_parent, data)
+    process_initial_nodes(node, data);
 
-    sanitize_node(node, data.policy);
+    data.new_added.push([node, true_parent]);
 
-    if (data.node_eater)
-        data.node_eater(node, true_parent);
+    while (data.new_added.length > 0) {
+        [node, true_parent] = data.new_added.shift();
 
-    finish_node(true_parent.hachette_last_added, data);
+        if (true_parent.hachette_deleted)
+            node.hachette_deleted = true;
+        if (node.hachette_deleted)
+            continue;
+
+        if (!true_parent.hachette_started)
+            start_serializing_node(true_parent, data);
+
+        if (!node.hachette_ignore)
+            sanitize_node(node, data);
+
+        if (node.hachette_deleted)
+            continue;
+
+        if (data.node_eater)
+            data.node_eater(node, true_parent);
 
-    true_parent.hachette_last_added = node;
+        finish_serializing_node(true_parent.hachette_last_added, data);
+
+        true_parent.hachette_last_added = node;
+    }
 }
 
 function handle_mutation(mutations, data)
@@ -170,28 +257,76 @@ function handle_mutation(mutations, data)
     * node.parentElement. The former is the correct one.
     */
    for (const mutation of mutations) {
-       for (const node of mutation.addedNodes)
+       for (const node of mutation.addedNodes) {
+           /* Check for nodes added by ourselves. */
+           if (mutation.target.hachette_ignore)
+               node.hachette_ignore = true;
+           if (node.hachette_ignore)
+               continue;
+
            handle_added_node(node, mutation.target, data);
+       }
    }
 }
 
 function finish_processing(data)
 {
+    process_initial_nodes(undefined, data);
+
+    /*
+     * The `finisher' callback should be called, if provided. Normally our
+     * function that performs the last write does it after seeing `finished'
+     * set to `true'. If, however, there's no `writer' callback and hence no
+     * writes to perform, we need to take care of calling `finisher' here.
+     */
+    data.finished = true;
    handle_mutation(data.observer.takeRecords(), data);
-    finish_node(data.html_element, data);
    data.observer.disconnect();
+
+    /*
+     * Additional whitespace that was after `</body>' gets appended to body.
+     * Although it's a minor issue, it is not what we want. There's no way to
+     * tell exactly what part of that whitespace was after `</body>' and what
+     * was before, so we just replace it with a single newline which looks good
+     * when printed.
+     */
+    const body = data.html_root.lastChild;
+    const text = body && body.tagName === "BODY" && body.lastChild;
+    if (text && text.nodeName === "#text") {
+        const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
+        text.textContent = new_content + "\n";
+    }
+
+    finish_serializing_node(data.html_root, data);
+    if (!data.writer && data.finisher)
+        setTimeout(data.finisher, 0);
 }
 
-function modify_on_the_fly(html_element, policy, consumers)
+/*
+ * This function sanitizes `html_root' according to `policy'. It is capable of
+ * working on an HTML document that is being written to, sanitizing new nodes
+ * as they appear.
+ *
+ * The `consumers' object may contain 3 optional callback functions: `writer',
+ * `node_eater' and `finisher'. The first one, if present, is called with
+ * chunks of reconstructed HTML code. The second one, if present, gets called
+ * for every added node with 2 arguments: that node and its parent. The third
+ * one is called at the end, after all processing has been done.
+ *
+ * `modify_on_the_fly()' returns a callback that should be called (with no
+ * arguments) once the document of `html_root' has finished being written to.
+ * Unfortunately, this cannot be detected automatically due to the specific
+ * behavior of a document that has had its documentElement replaced.
+ */
+function modify_on_the_fly(html_root, policy, consumers)
 {
     const uniq = gen_nonce();
-    const uniq_reg = new RegExp(`^(.*)${uniq}(.*)$`);
-    const data = {policy, html_element, uniq, uniq_reg, ...consumers};
-
-    start_node(data.html_element, data);
+    const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
+    const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
+    Object.assign(data, consumers);
 
     var observer = new MutationObserver(m => handle_mutation(m, data));
-    observer.observe(data.html_element, {
+    observer.observe(data.html_root, {
        attributes: true,
        childList: true,
        subtree: true
     });
diff --git a/copyright b/copyright
--- a/copyright
+++ b/copyright
@@ -20,6 +20,13 @@ Copyright: 2021 Wojtek Kosior <koszko@koszko.org>
            2021 jahoti <jahoti@tilde.team>
 License: GPL-3+-javascript or Alicense-1.0
 
+Files: background/stream_filter.js
+Copyright: 2018 Giorgio Maone <giorgio@maone.net>
+           2021 Wojtek Kosior <koszko@koszko.org>
+License: GPL-3+-javascript or Alicense-1.0, and GPL-3+
+Comment: Code by Wojtek is dual-licensed under GPL-3+-javascript and
+ Alicense-1.0. Giorgio's code is under GPL-3+.
+
 Files: *.html README.txt copyright
 Copyright: 2021 Wojtek Kosior <koszko@koszko.org>
 License: GPL-3+ or Alicense-1.0 or CC-BY-SA-4.0
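
Two illustrative sketches follow; they are not part of the commit. First, a minimal driver for the `modify_on_the_fly()' API added in `content/sanitize_document.js'. The `mock_policy' object, the `chunks' accumulator and the use of `DOMContentLoaded' as the "document finished being written to" signal are assumptions made for illustration; in the extension itself the policy object is built in `on_headers_received()' and the consumers are wired up by `content/main.js'.

    /* Hypothetical caller of modify_on_the_fly(). */
    const mock_policy = {allow: false, url: document.URL, nonce: gen_nonce()};
    const chunks = [];

    const finish = modify_on_the_fly(document.documentElement, mock_policy, {
        /* Receives batched chunks of reconstructed HTML. */
        writer: chunk => chunks.push(chunk),
        /* Runs once, after the last write has been performed. */
        finisher: () => console.log("sanitized HTML:", chunks.join(""))
    });

    /* Per the docstring, the caller signals the end of document parsing. */
    document.addEventListener("DOMContentLoaded", () => finish());

Second, the charset detection in `background/stream_filter.js' tries a byte-order mark first, then the `Content-Type' header, then `<meta>' tags. The BOM step can be checked in isolation (byte values chosen for illustration):

    charset_from_BOM(new Uint8Array([0xef, 0xbb, 0xbf, 0x3c])); /* "utf-8" */
    charset_from_BOM(new Uint8Array([0xfe, 0xff, 0x00, 0x3c])); /* "utf-16be" */
    charset_from_BOM(new Uint8Array([0x3c, 0x21]));             /* "" (no BOM) */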