/**
 * This file is part of Haketilo.
 *
 * Function: Modifying a web page using the StreamFilter API.
 *
 * Copyright (C) 2021, Wojtek Kosior
 * Copyright (C) 2018, Giorgio Maone <giorgio@maone.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
 * license. Although I request that you do not make use of this code in a
 * proprietary program, I am not going to enforce this in court.
 *
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

#FROM common/browser.js IMPORT browser
#FROM common/misc.js    IMPORT csp_header_regex

/* Return `charset' if TextDecoder accepts it as a label, else undefined. */
function validate_encoding(charset)
{
    try {
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}

function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

/*
 * Extract response properties (declared charset, whether the response is
 * HTML) from its `Content-Type' headers.
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        if (match && !properties.detected_charset &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM,     "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/* Detect the charset from a byte order mark at the start of `data', if any. */
function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}

const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");

/* Detect the charset from `<meta>' tags in the parsed document's `<head>'. */
function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}

/*
 * Choose a TextDecoder for the stream: a BOM or a `Content-Type' charset
 * takes precedence, then embedded NUL bytes suggest UTF-16, and finally
 * `<meta>' tags are sniffed from the content itself.
 */
function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;

    if (charset)
        return new TextDecoder(charset);

    if (data.indexOf(0) !== -1) {
        console.warn("Haketilo: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                     properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    /* "latin1" maps every byte to a character, so markup survives sniffing. */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
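/*
 * For illustration, BOM detection alone classifies example byte streams as
 * follows (values verifiable against `BOMs' above):
 *
 *     charset_from_BOM(new Uint8Array([0xef, 0xbb, 0xbf, 0x3c])); // "utf-8"
 *     charset_from_BOM(new Uint8Array([0xfe, 0xff, 0x00, 0x3c])); // "utf-16be"
 *     charset_from_BOM(new Uint8Array([0x3c, 0x68, 0x74, 0x6d])); // "" - no BOM
 *
 * In the last case `create_decoder()' falls back to the `Content-Type'
 * charset, then the NUL-byte heuristic, then `<meta>' tag sniffing.
 */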
/*
 * Heuristically decide whether this initial chunk of HTML declares (or,
 * once more data arrives, still could declare) CSP rules via `http-equiv'
 * `<meta>' tags.
 */
function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(meta.httpEquiv) && meta.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of HTML
     * data could add some. Before we return `false' we need to be sure we
     * reached the start of `<body>', where `<meta>' tags are no longer valid.
     */
    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
        doc.body.childNodes.length > 1)
        return false;

    if (!doc.body.firstChild)
        return true;

    if (doc.body.firstChild.nodeName !== "#text")
        return false;

    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}

/*
 * StreamFilter `ondata' handler: decode a chunk, neutralize CSP `<meta>'
 * tags if necessary and write the chunk back out re-encoded as UTF-8.
 */
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    /* Stream mode keeps multi-byte sequences split across chunks intact. */
    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen earlier.
         * This way our content scripts will be able to sanitize `http-equiv'
         * tags with CSP rules that would otherwise stop our injected scripts
         * from executing.
         *
         * As we only want to process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME sniffing
         * as it is too unreliable (and our heuristic will likely mark
         * non-HTML files as harmless anyway).
         */
        const dummy_script = `<script>null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}

/*
 * Attach a StreamFilter to this request when `policy' calls for payload
 * injection; return the (unmodified) response headers.
 */
function apply(details, headers, policy)
{
    if (!policy.payload)
        return headers;

    const properties = properties_from_headers(headers);
    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * encoding. For now we don't, though; prepending the data with a UTF-8
     * BOM should be enough.
     */
    return headers;
}

#EXPORT apply
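/*
 * Minimal usage sketch (hypothetical wiring, not taken from this codebase):
 * `apply' is designed to run inside a blocking
 * `webRequest.onHeadersReceived' listener. `compute_policy()' below is an
 * assumed helper standing in for Haketilo's actual policy lookup.
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => ({
 *             responseHeaders:
 *                 apply(details, details.responseHeaders,
 *                       compute_policy(details.url))
 *         }),
 *         {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 */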