From 6b53d6c840140fc5df6d7638808b978d96502a35 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Mon, 23 Aug 2021 11:05:51 +0200 Subject: use StreamFilter under Mozilla to prevent csp tags from blocking our injected scripts --- background/stream_filter.js | 176 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 background/stream_filter.js (limited to 'background/stream_filter.js') diff --git a/background/stream_filter.js b/background/stream_filter.js new file mode 100644 index 0000000..2dce811 --- /dev/null +++ b/background/stream_filter.js @@ -0,0 +1,176 @@ +/** + * Hachette modifying a web page using the StreamFilter API + * + * Copyright (C) 2018 Giorgio Maone + * Copyright (C) 2021 Wojtek Kosior + * Redistribution terms are gathered in the `copyright' file. + * + * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js' + * in LibreJS. + */ + +/* + * IMPORTS_START + * IMPORT browser + * IMPORTS_END + */ + +function validate_encoding(charset) +{ + try { + new TextDecoder(); + return charset; + } catch(e) { + return undefined; + } +} + +function is_content_type_header(header) +{ + header.name.toLowerCase().trim() === "content-type"; +} + +const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i; + +function properties_from_headers(headers) +{ + const properties = {}; + + for (const header of headers.filter(is_content_type_header)) { + const match = charset_reg.exec(header.value); + if (!properties.detected_charset && validate_encoding(match[1])) + properties.detected_charset = match[1]; + + if (/html/i.test(header.value)) + properties.html = true; + } + + return properties; +} + +const UTF8_BOM = [0xef, 0xbb, 0xbf]; +const BOMs = [ + [UTF8_BOM, "utf-8"], + [[0xfe, 0xff], "utf-16be"], + [[0xff, 0xfe], "utf-16le"] +]; + +function charset_from_BOM(data) +{ + for (const [BOM, charset] of BOMs) { + if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true)) + return charset; + } + + return ""; +} + +const charset_attrs = + ['charset', 'http-equiv="content-type"', 'content*="charset"']; +const charset_meta_selector = + charset_attrs.map(a => `head>meta[${a}]`).join(", "); + +function charset_from_meta_tags(doc) +{ + for (const meta of doc.querySelectorAll(charset_meta_selector)) { + const maybe_charset = meta.getAttribute("charset"); + if (maybe_charset && validate_encoding(maybe_charset)) + return maybe_charset; + + const match = charset_reg.exec(meta.getAttribute("content")); + if (match && validate_encoding(match[1])) + return match[1]; + } + + return undefined; +} + +function create_decoder(properties, data) +{ + let charset = charset_from_BOM(data) || properties.detected_charset; + if (!charset && data.indexOf(0) !== -1) { + console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.", + properties); + return new TextDecoder("utf-16be"); + } + + /* Missing HTTP charset, sniffing in content... */ + /* + * TODO: I recall there is some standard saying how early in the doc the + * charset has to be specified. We could process just this part of data. + */ + const text = new TextDecoder("latin1").decode(data, {stream: true}); + properties.html = properties.html || /html/i.test(text); + + if (properties.html) { + const tmp_doc = new DOMParser().parseFromString(text, "text/html"); + charset = charset_from_meta_tags(tmp_doc); + } + + return new TextDecoder(charset || "latin1"); +} + +function filter_data(properties, event) +{ + const data = new Uint8Array(event.data); + let first_chunk = false; + if (!properties.decoder) { + first_chunk = true; + properties.decoder = create_decoder(properties, data); + properties.encoder = new TextEncoder(); + /* Force UTF-8, this is the only encoding we can produce. */ + properties.filter.write(new Uint8Array(UTF8_BOM)); + } + + let decoded = properties.decoder.decode(data); + + if (first_chunk) { + /* + * HAX! Our content scripts that execute at `document_start' will always + * run before the first script in the document, but under Mozilla some + * `' tags might already be loaded at that point. Here we inject a + * dummy ``; + const doctype_decl = /^(\s*"']*>)?/i.exec(decoded)[0]; + decoded = doctype_decl + dummy_script + + decoded.substring(doctype_decl.length); + } + + properties.filter.write(properties.encoder.encode(decoded)); + + if (properties.decoder.encoding === "utf-8") + properties.filter.disconnect(); +} + +function apply_stream_filter(details, headers, policy) +{ + if (policy.allow) + return headers; + + const properties = properties_from_headers(headers); + properties.policy = policy; + + properties.filter = + browser.webRequest.filterResponseData(details.requestId); + + properties.filter.ondata = event => filter_data(properties, event); + properties.filter.onstop = () => properties.filter.close(); + + /* + * In the future we might consider modifying the headers that specify + * encoding. For now we are not yet doing it, though. However, we + * prepend the data with UTF-8 BOM which should be enough. + */ + return headers; +} + +/* + * EXPORTS_START + * EXPORT apply_stream_filter + * EXPORTS_END + */ -- cgit v1.2.3