3 files changed, 247 insertions, 50 deletions
diff --git a/background/main.js b/background/main.js
index 7c50fd5..85f8ce8 100644
--- a/background/main.js
+++ b/background/main.js
@@ -11,18 +11,21 @@
  * IMPORT get_storage
  * IMPORT start_storage_server
  * IMPORT start_page_actions_server
- * IMPORT start_policy_injector
  * IMPORT browser
+ * IMPORT is_privileged_url
+ * IMPORT query_best
+ * IMPORT gen_nonce
+ * IMPORT inject_csp_headers
+ * IMPORT apply_stream_filter
+ * IMPORT is_chrome
  * IMPORTS_END
  */
 
 start_storage_server();
 start_page_actions_server();
-start_policy_injector();
 
 async function init_ext(install_details)
 {
-    console.log("details:", install_details);
     if (install_details.reason != "install")
 	return;
 
@@ -44,4 +47,53 @@ async function init_ext(install_details)
 
 browser.runtime.onInstalled.addListener(init_ext);
 
-console.log("hello, hachette");
+
+let storage; /* Set by start_webRequest_operations(), read by the listener. */
+
+/*
+ * Blocking `onHeadersReceived' listener. Injects CSP headers and, unless the
+ * response is a redirect or a download, installs the stream filter.
+ */
+function on_headers_received(details)
+{
+    const url = details.url;
+    if (is_privileged_url(url))
+	return;
+
+    const [, settings] = query_best(storage, url);
+    const allow = !!(settings && settings.allow);
+    const policy = {allow, url, nonce: gen_nonce()};
+
+    /* A bare `attachment' (without any `;param') is a download as well. */
+    const is_attachment = header =>
+	  header.name.toLowerCase().trim() === "content-disposition" &&
+	  /^\s*attachment\s*(;.*)?$/i.test(header.value);
+    const is_redirect = details.statusCode >= 300 && details.statusCode < 400;
+    const skip = is_redirect || details.responseHeaders.some(is_attachment);
+
+    let headers =
+	inject_csp_headers(details, details.responseHeaders, policy);
+
+    /* Check for API availability. */
+    if (!skip && browser.webRequest.filterResponseData)
+	headers = apply_stream_filter(details, headers, policy);
+
+    return {responseHeaders: headers};
+}
+
+/* Load storage, then register the blocking listener for frame documents. */
+async function start_webRequest_operations()
+{
+    storage = await get_storage();
+
+    const filter = {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]};
+    const extra_opts = ["blocking", "responseHeaders"].concat(
+	is_chrome ? ["extraHeaders"] : []
+    );
+
+    browser.webRequest.onHeadersReceived.addListener(
+	on_headers_received, filter, extra_opts
+    );
+}
+
+start_webRequest_operations();
diff --git a/background/policy_injector.js b/background/policy_injector.js
index 3398b53..1d4db6f 100644
--- a/background/policy_injector.js
+++ b/background/policy_injector.js
@@ -8,36 +8,21 @@
 
 /*
  * IMPORTS_START
- * IMPORT get_storage
- * IMPORT browser
- * IMPORT is_chrome
- * IMPORT gen_nonce
- * IMPORT is_privileged_url
  * IMPORT sign_data
  * IMPORT extract_signed
- * IMPORT query_best
  * IMPORT sanitize_csp_header
  * IMPORT csp_rule
  * IMPORT is_csp_header_name
  * IMPORTS_END
  */
 
-var storage;
-
-function headers_inject(details)
+function inject_csp_headers(details, headers, policy)
 {
     const url = details.url;
-    if (is_privileged_url(url))
-	return;
-
-    const [pattern, settings] = query_best(storage, url);
-    const allow = !!(settings && settings.allow);
-    const nonce = gen_nonce();
 
     let orig_csp_headers;
     let old_signature;
     let hachette_header;
-    let headers = details.responseHeaders;
 
     for (const header of headers.filter(h => h.name === "x-hachette")) {
 	const match = /^([^%])(%.*)$/.exec(header.value);
@@ -50,7 +35,7 @@ function headers_inject(details)
 
 	/* Confirmed- it's the originals, smuggled in! */
 	orig_csp_headers = old_data.csp_headers;
-	old_signature = old_data.policy_signature;
+	old_signature = old_data.policy_sig;
 
 	hachette_header = header;
 	break;
@@ -65,21 +50,20 @@ function headers_inject(details)
 	headers.filter(h => is_csp_header_name(h.name));
 
     /* When blocking remove report-only CSP headers that snitch on us. */
-    headers = headers.filter(h => !is_csp_header_name(h.name, !allow));
+    headers = headers.filter(h => !is_csp_header_name(h.name, !policy.allow));
 
     if (old_signature)
 	headers = headers.filter(h => h.name.search(old_signature) === -1);
 
-    const policy_object = {allow, nonce, url};
-    const sanitizer = h => sanitize_csp_header(h, policy_object);
+    const sanitizer = h => sanitize_csp_header(h, policy);
     headers.push(...orig_csp_headers.map(sanitizer));
 
-    const policy = encodeURIComponent(JSON.stringify(policy_object));
-    const policy_signature = sign_data(policy, new Date());
+    const policy_str = encodeURIComponent(JSON.stringify(policy));
+    const policy_sig = sign_data(policy_str, new Date());
     const later_30sec = new Date(new Date().getTime() + 30000).toGMTString();
     headers.push({
 	name: "Set-Cookie",
-	value: `hachette-${policy_signature}=${policy}; Expires=${later_30sec};`
+	value: `hachette-${policy_sig}=${policy_str}; Expires=${later_30sec};`
     });
 
     /*
@@ -87,37 +71,22 @@ function headers_inject(details)
      * These are signed with a time of 0, as it's not clear there is a limit on
      * how long Firefox might retain headers in the cache.
      */
-    let hachette_data = {csp_headers: orig_csp_headers, policy_signature, url};
+    let hachette_data = {csp_headers: orig_csp_headers, policy_sig, url};
     hachette_data = encodeURIComponent(JSON.stringify(hachette_data));
     hachette_header.value = sign_data(hachette_data, 0) + hachette_data;
 
     /* To ensure there is a CSP header if required */
-    if (!allow)
-	headers.push({name: "content-security-policy", value: csp_rule(nonce)});
+    if (!policy.allow)
+	headers.push({
+	    name: "content-security-policy",
+	    value: csp_rule(policy.nonce)
+	});
 
-    return {responseHeaders: headers};
-}
-
-async function start_policy_injector()
-{
-    storage = await get_storage();
-
-    let extra_opts = ["blocking", "responseHeaders"];
-    if (is_chrome)
-	extra_opts.push("extraHeaders");
-
-    browser.webRequest.onHeadersReceived.addListener(
-	headers_inject,
-	{
-	    urls: ["<all_urls>"],
-	    types: ["main_frame", "sub_frame"]
-	},
-	extra_opts
-    );
+    return headers;
 }
 
 /*
  * EXPORTS_START
- * EXPORT start_policy_injector
+ * EXPORT inject_csp_headers
  * EXPORTS_END
  */
diff --git a/background/stream_filter.js b/background/stream_filter.js
new file mode 100644
index 0000000..2dce811
--- /dev/null
+++ b/background/stream_filter.js
@@ -0,0 +1,176 @@
+/**
+ * Hachette modifying a web page using the StreamFilter API
+ *
+ * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ *
+ * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
+ * in LibreJS.
+ */
+
+/*
+ * IMPORTS_START
+ * IMPORT browser
+ * IMPORTS_END
+ */
+
+/* Return `charset' if TextDecoder accepts this label, undefined otherwise. */
+function validate_encoding(charset)
+{
+    try {
+	return new TextDecoder(charset) && charset;
+    } catch(e) {
+	return undefined;
+    }
+}
+
+/* Predicate for filter(): HTTP header names are case-insensitive. */
+function is_content_type_header(header)
+{
+    return header.name.toLowerCase().trim() === "content-type";
+}
+
+const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
+
+/* Collect the charset (an optional parameter) and HTML-ness of the body. */
+function properties_from_headers(headers)
+{
+    const properties = {};
+    for (const header of headers.filter(is_content_type_header)) {
+	const match = charset_reg.exec(header.value);
+	const charset = match && validate_encoding(match[1]);
+	if (charset && !properties.detected_charset)
+	    properties.detected_charset = charset;
+	if (/html/i.test(header.value))
+	    properties.html = true;
+    }
+    return properties;
+}
+
+/* Byte order marks we recognize, each paired with its encoding label. */
+const UTF8_BOM = [0xef, 0xbb, 0xbf];
+const BOMs = [
+    [UTF8_BOM, "utf-8"],
+    [[0xfe, 0xff], "utf-16be"],
+    [[0xff, 0xfe], "utf-16le"]
+];
+
+/* Return the encoding whose BOM starts `data', or "" when there is none. */
+function charset_from_BOM(data)
+{
+    const starts_with = bom => bom.every((byte, i) => data[i] === byte);
+    const found = BOMs.find(([bom]) => starts_with(bom));
+
+    return found ? found[1] : "";
+}
+
+/* Selector matching every `<meta>' in `<head>' that may declare a charset. */
+const charset_attrs =
+      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
+const charset_meta_selector =
+      charset_attrs.map(a => `head>meta[${a}]`).join(", ");
+
+/* Return the first supported charset declared by `doc', else undefined. */
+function charset_from_meta_tags(doc)
+{
+    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
+	const in_content = charset_reg.exec(meta.getAttribute("content"));
+	const candidates =
+	      [meta.getAttribute("charset"), in_content && in_content[1]];
+	const supported = candidates.find(c => c && validate_encoding(c));
+	if (supported)
+	    return supported;
+    }
+    return undefined;
+}
+
+/* Pick a decoder for the body; BOM and HTTP charset take precedence. */
+function create_decoder(properties, data)
+{
+    let charset = charset_from_BOM(data) || properties.detected_charset;
+    if (!charset && data.indexOf(0) !== -1) {
+	console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
+		      properties);
+	return new TextDecoder("utf-16be");
+    }
+
+    /*
+     * Sniffing in content... TODO: I recall there is some standard saying
+     * how early in the doc the charset has to be specified. We could process
+     * just this part of data.
+     */
+    const text = new TextDecoder("latin1").decode(data, {stream: true});
+    properties.html = properties.html || /html/i.test(text);
+    /* Ask `<meta>' tags only if neither BOM nor headers named a charset. */
+    if (!charset && properties.html) {
+	const tmp_doc = new DOMParser().parseFromString(text, "text/html");
+	charset = charset_from_meta_tags(tmp_doc);
+    }
+
+    return new TextDecoder(charset || "latin1");
+}
+
+/* `ondata' handler: recode a chunk to UTF-8, patching the first one. */
+function filter_data(properties, event)
+{
+    const data = new Uint8Array(event.data);
+    const first_chunk = !properties.decoder;
+    if (first_chunk) {
+	properties.decoder = create_decoder(properties, data);
+	properties.encoder = new TextEncoder();
+	/* Force UTF-8, this is the only encoding we can produce. */
+	properties.filter.write(new Uint8Array(UTF8_BOM));
+    }
+
+    /* Stream mode: a character split between two chunks is not mangled. */
+    let decoded = properties.decoder.decode(data, {stream: true});
+
+    if (first_chunk) {
+	/*
+	 * HAX! Our content scripts that execute at `document_start' will always
+	 * run before the first script in the document, but under Mozilla some
+	 * `<meta>' tags might already be loaded at that point. Here we inject a
+	 * dummy `<script>' at the beginning (before any `<meta>' tags) that
+	 * will force `document_start' to happen earlier. This way our content
+	 * scripts will be able to sanitize `http-equiv' tags with CSP rules
+	 * that would otherwise stop our injected scripts from executing.
+	 */
+	const dummy_script =
+	      `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
+	const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
+	decoded = doctype_decl + dummy_script +
+	    decoded.substring(doctype_decl.length);
+    }
+    properties.filter.write(properties.encoder.encode(decoded));
+    /* UTF-8 needs no recoding, let the remaining data pass through as is. */
+    if (properties.decoder.encoding === "utf-8")
+	properties.filter.disconnect();
+}
+
+/*
+ * Hook a StreamFilter onto the request unless `policy.allow' is set; each
+ * chunk then goes through filter_data(). `headers' are returned as given.
+ * In the future we might consider modifying the headers that specify
+ * encoding. For now we are not yet doing it, though. However, we prepend
+ * the data with UTF-8 BOM which should be enough.
+ */
+function apply_stream_filter(details, headers, policy)
+{
+    if (!policy.allow) {
+	const properties = properties_from_headers(headers);
+	const filter =
+	      browser.webRequest.filterResponseData(details.requestId);
+	Object.assign(properties, {policy, filter});
+
+	filter.ondata = event => filter_data(properties, event);
+	filter.onstop = () => filter.close();
+    }
+
+    return headers;
+}
+
+/*
+ * EXPORTS_START
+ * EXPORT apply_stream_filter
+ * EXPORTS_END
+ */