/**
 * This file is part of Haketilo.
 *
 * Function: Modifying a web page using the StreamFilter API.
 *
 * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORT csp_header_regex
 * IMPORTS_END
 */

function validate_encoding(charset)
{
    try {
        /* TextDecoder's constructor throws on unrecognized labels. */
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}

function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        /* The header might carry no charset parameter at all. */
        const match = charset_reg.exec(header.value);
        if (match && !properties.detected_charset &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];
        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}

const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");

function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}

function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;

    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        /* Keep a BOM- or header-detected charset if the tags yield none. */
        charset = charset_from_meta_tags(tmp_doc) || charset;
    }

    return new TextDecoder(charset || "latin1");
}
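/*
 * A hypothetical illustration (not part of the extension's code paths) of
 * how the charset-detection helpers above compose; the values are made up:
 *
 *     const headers = [{name: "Content-Type",
 *                       value: "text/html; charset=ISO-8859-2"}];
 *     const props = properties_from_headers(headers);
 *     // props.detected_charset === "ISO-8859-2", props.html === true
 *
 *     const bytes = new Uint8Array([0xff, 0xfe, 0x3c, 0x00]);
 *     charset_from_BOM(bytes);  // "utf-16le" - a BOM wins over headers
 *
 *     const decoder = create_decoder(props, bytes);
 *     // decoder.encoding is "utf-16le", reflecting the BOM
 */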
function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(meta.httpEquiv) && meta.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of HTML
     * data could add some. Before we return `false' we need to be sure we
     * reached the start of `<body>' where `<meta>' tags are no longer valid.
     */
    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
        doc.body.childNodes.length > 1)
        return false;

    if (!doc.body.firstChild)
        return true;

    if (doc.body.firstChild.nodeName !== "#text")
        return false;

    /* A tiny trailing fragment might still be a cut-off `<head>' token. */
    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}

function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    /*
     * Decode with {stream: true} so that multi-byte sequences split across
     * chunk boundaries are carried over instead of becoming U+FFFD.
     */
    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen earlier.
         * This way our content scripts will be able to sanitize `http-equiv'
         * tags with CSP rules that would otherwise stop our injected scripts
         * from executing.
         *
         * As we only want to process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME sniffing
         * as it is too unreliable (and our heuristic will likely mark
         * non-HTML files as harmless anyway).
         */
        const dummy_script = `<script>null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}

function apply_stream_filter(details, headers, policy)
{
    if (!policy.payload)
        return headers;

    const properties = properties_from_headers(headers);
    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * encoding. For now we are not yet doing it, though. However, we
     * prepend the data with a UTF-8 BOM, which should be enough.
     */
    return headers;
}

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */
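/*
 * A sketch of how a caller might wire `apply_stream_filter()' into a
 * blocking `webRequest' listener. The real listener lives elsewhere in
 * Haketilo; `make_policy()' below is a hypothetical stand-in for however
 * the caller computes the policy object:
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => {
 *             const policy = make_policy(details);
 *             const headers = apply_stream_filter(
 *                 details, details.responseHeaders, policy);
 *             return {responseHeaders: headers};
 *         },
 *         {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 *
 * `filterResponseData()' is Mozilla-specific, so this wiring only takes
 * effect under Firefox-derived browsers.
 */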