aboutsummaryrefslogtreecommitdiff
path: root/background/stream_filter.js
diff options
context:
space:
mode:
Diffstat (limited to 'background/stream_filter.js')
-rw-r--r--background/stream_filter.js214
1 files changed, 214 insertions, 0 deletions
diff --git a/background/stream_filter.js b/background/stream_filter.js
new file mode 100644
index 0000000..e5d124c
--- /dev/null
+++ b/background/stream_filter.js
@@ -0,0 +1,214 @@
+/**
+ * This file is part of Haketilo.
+ *
+ * Function: Modifying a web page using the StreamFilter API.
+ *
+ * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+ * Copyright (C) 2021 Wojtek Kosior
+ * Redistribution terms are gathered in the `copyright' file.
+ *
+ * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
+ * in LibreJS.
+ */
+
+/*
+ * IMPORTS_START
+ * IMPORT browser
+ * IMPORT csp_header_regex
+ * IMPORTS_END
+ */
+
/**
 * Check whether `charset' is an encoding label the platform can decode.
 *
 * @param {string} charset - Encoding label to check (e.g. "utf-8").
 * @returns {string|undefined} `charset' itself when a TextDecoder can be
 *     constructed for it, `undefined' otherwise.
 */
function validate_encoding(charset)
{
    try {
        /* The TextDecoder constructor throws on labels it doesn't support. */
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}
+
/**
 * Tell whether a header object is a `Content-Type' header.
 *
 * @param {{name: string}} header - Header object with a `name' property.
 * @returns {boolean} `true' when the name, ignoring case and surrounding
 *     whitespace, equals "content-type".
 */
function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}
+
/* Captures the label of a `; charset=<label>' parameter, ignoring case. */
const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

/**
 * Derive stream properties from the `Content-Type' headers of a response.
 *
 * @param {Array<{name: string, value: string}>} headers - Response headers.
 * @returns {{detected_charset?: string, html?: boolean}} Object whose
 *     `detected_charset' is the first charset label that passes
 *     `validate_encoding()' and whose `html' is `true' when any
 *     `Content-Type' value mentions "html". Properties that do not apply
 *     are left unset.
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        /* `match' is null when the header carries no charset parameter. */
        const match = charset_reg.exec(header.value);
        if (!properties.detected_charset && match &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}
+
/* Byte order marks we recognize, each paired with the encoding it implies. */
const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/**
 * Detect the encoding announced by a byte order mark at the start of `data'.
 *
 * @param {Uint8Array|number[]} data - Leading bytes of the response body.
 * @returns {string} Encoding name of the first matching BOM or "" when
 *     `data' starts with none of the known marks.
 */
function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        /*
         * Out-of-range indexes yield `undefined', so data shorter than the
         * BOM simply fails to match.
         */
        if (BOM.every((byte, i) => byte === data[i]))
            return charset;
    }

    return "";
}
+
/* Attribute selectors picking out `<meta>' tags that may declare a charset. */
const charset_attrs = [
    'charset',
    'http-equiv="content-type"',
    'content*="charset"'
];
const charset_meta_selector = charset_attrs
    .map(attr_selector => `head>meta[${attr_selector}]`)
    .join(", ");

/**
 * Look for a usable charset declaration among the `<meta>' tags in `<head>'.
 *
 * @param {Document} doc - Document to query with `charset_meta_selector'.
 * @returns {string|undefined} The first label, taken from a `charset'
 *     attribute or from a `content' attribute matched by `charset_reg',
 *     that passes `validate_encoding()'; `undefined' when there is none.
 */
function charset_from_meta_tags(doc)
{
    const candidates = doc.querySelectorAll(charset_meta_selector);

    for (const tag of candidates) {
        /* A direct `charset' attribute is checked first. */
        const declared = tag.getAttribute("charset");
        if (declared && validate_encoding(declared))
            return declared;

        /* Otherwise look for `; charset=...' inside `content'. */
        const found = charset_reg.exec(tag.getAttribute("content"));
        if (found === null)
            continue;

        if (validate_encoding(found[1]))
            return found[1];
    }

    return undefined;
}
+
/**
 * Choose a TextDecoder for a response, based on its first chunk of data.
 *
 * The charset is taken, in order of precedence, from a byte order mark,
 * from `properties.detected_charset' and from `<meta>' tags found in the
 * data. `<meta>' tags are only consulted when the earlier sources gave
 * nothing, so they can no longer override (or erase) a charset that is
 * already known.
 *
 * As a side effect `properties.html' is set to `true' when the data, read
 * as latin1, contains "html".
 *
 * @param {Object} properties - Stream properties; `detected_charset' and
 *     `html' are read, `html' may be updated.
 * @param {Uint8Array} data - First chunk of the response body.
 * @returns {TextDecoder} Decoder for the chosen charset, "utf-16be" when
 *     nothing is known and the data contains zero bytes, "latin1" as the
 *     last resort.
 */
function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    /* Missing BOM and HTTP charset, sniffing in content... */
    if (!charset && properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
+
/**
 * Heuristically tell whether a piece of HTML defines, or could still go on
 * to define, CSP rules through `<meta http-equiv>' tags in `<head>'.
 *
 * @param {string} html - Decoded beginning of a document.
 * @returns {boolean} `true' when such a `<meta>' tag with non-empty content
 *     is present or when the parsed data does not yet show that `<body>'
 *     has started; `false' otherwise.
 */
function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");
    const metas = Array.from(doc.querySelectorAll("head>meta[http-equiv]"));

    if (metas.some(m => csp_header_regex.test(m.httpEquiv) && m.content))
        return true;

    /*
     * No naughty `<meta>' tag so far, but a later chunk of HTML could still
     * bring one. Answering `false' is only safe once we know `<body>' has
     * begun, because `<meta>' tags are no longer valid from there on.
     */
    const body_started =
          doc.documentElement.nextSibling || doc.body.nextSibling ||
          doc.body.childNodes.length > 1;
    if (body_started)
        return false;

    const only_child = doc.body.firstChild;

    /* Empty `<body>': nothing proves that `<head>' is over. */
    if (!only_child)
        return true;

    if (only_child.nodeName !== "#text")
        return false;

    /* A lone text node counts as undecided only for these short texts. */
    return /^(<\/|&#|.)$/.test(only_child.wholeText);
}
+
/**
 * Handle one chunk of response data: decode it, possibly inject a dummy
 * `<script>' into the first chunk and write the result out as UTF-8.
 *
 * On the first chunk `properties.decoder' and `properties.encoder' get
 * created and a UTF-8 BOM is written before any other output. When the
 * decoder's encoding is "utf-8" the filter is disconnected after the chunk
 * has been written.
 *
 * @param {Object} properties - Stream state; uses `filter', `decoder' and
 *     `encoder'.
 * @param {{data: ArrayBuffer}} event - Event carrying the chunk in `data'.
 */
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    const first_chunk = !properties.decoder;
    if (first_chunk) {
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    /*
     * Decode in streaming mode so that a multi-byte character split between
     * two chunks is completed by the next call instead of being turned into
     * replacement characters. Bytes of a sequence that is still incomplete
     * when the stream ends are not flushed by this function.
     */
    let decoded = properties.decoder.decode(data, {stream: true});

    if (first_chunk) {
        /* Force UTF-8, this is the only encoding we can produce. */
        properties.filter.write(new Uint8Array(UTF8_BOM));

        if (may_define_csp_rules(decoded)) {
            /*
             * HAX! Our content scripts that execute at `document_start' will
             * always run before the first script in the document, but under
             * Mozilla some `<meta>' tags might already be loaded at that
             * point. Here we inject a dummy `<script>' at the beginning
             * (before any `<meta>' tags) that will force `document_start' to
             * happen earlier. This way our content scripts will be able to
             * sanitize `http-equiv' tags with CSP rules that would otherwise
             * stop our injected scripts from executing.
             *
             * As we want to only process HTML files that happen to have
             * naughty `<meta>' tags in `<head>', we use a DOMParser-based
             * heuristic in `may_define_csp_rules()'. We don't do any
             * additional MIME sniffing as it is too unreliable (and our
             * heuristic will likely mark non-HTML files as harmless anyway).
             */
            const dummy_script = `<script>null</script>`;
            /* The group is optional, so this always matches ("" at least). */
            const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
            decoded = doctype_decl + dummy_script +
                decoded.substring(doctype_decl.length);
        }
    }

    properties.filter.write(properties.encoder.encode(decoded));

    /* Data that already is UTF-8 needs no re-encoding; stop filtering it. */
    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}
+
/**
 * Attach a response data filter to a request when the policy has a payload.
 *
 * @param {Object} details - Request details; `requestId' selects the
 *     response to filter.
 * @param {Array} headers - Response headers, used to derive the stream
 *     properties.
 * @param {Object} policy - Policy object; filtering only happens when its
 *     `payload' is truthy.
 * @returns {Array} The `headers' argument, unmodified.
 */
function apply_stream_filter(details, headers, policy)
{
    if (policy.payload) {
        const properties = properties_from_headers(headers);
        const filter = browser.webRequest.filterResponseData(details.requestId);

        properties.filter = filter;
        filter.ondata = event => filter_data(properties, event);
        filter.onstop = () => properties.filter.close();
    }

    /*
     * Headers that specify the encoding are not modified for now, although
     * we might consider doing that in the future. The data gets prepended
     * with a UTF-8 BOM instead, which should be enough.
     */
    return headers;
}
+
+/*
+ * EXPORTS_START
+ * EXPORT apply_stream_filter
+ * EXPORTS_END
+ */