/**
 * Hachette modifying a web page using the StreamFilter API
 *
 * Copyright (C) 2018 Giorgio Maone
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORTS_END
 */

/*
 * Return `charset' unchanged when TextDecoder accepts it as an encoding
 * label, undefined otherwise.
 */
function validate_encoding(charset)
{
    try {
        /* The constructor throws for labels it does not know. */
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}

/*
 * Tell whether `header' (an object with a `name' string) is a Content-Type
 * header. The comparison ignores case and surrounding whitespace.
 */
function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

/* Captures the charset label of a `...; charset=label' parameter. */
const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

/*
 * Build the initial `properties' object from response headers.
 *
 * The result may carry:
 *   detected_charset - first charset label found in a Content-Type header
 *                      that TextDecoder accepts;
 *   html             - true when some Content-Type value mentions "html".
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        /* `match' is null for a Content-Type without a charset parameter. */
        const match = charset_reg.exec(header.value);
        if (!properties.detected_charset && match &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/*
 * Return the charset announced by a byte order mark at the start of `data'
 * or an empty string when `data' starts with none of the known BOMs.
 */
function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        /* Bytes past the end of `data' are undefined and never match. */
        if (BOM.every((byte, i) => byte === data[i]))
            return charset;
    }

    return "";
}

const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");

/*
 * Look through the `meta' elements in the head of `doc' and return the first
 * charset label accepted by TextDecoder, taken either from a `charset'
 * attribute or from a `charset=' parameter inside `content'. Return
 * undefined when there is none.
 */
function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        /* getAttribute() gives null when the attribute is absent. */
        const match = charset_reg.exec(meta.getAttribute("content") || "");
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}

/*
 * Choose a TextDecoder for a response whose first chunk is `data'.
 *
 * Order of precedence: byte order mark, charset from HTTP headers
 * (`properties.detected_charset'), then sniffing of the content itself.
 * The sniffing may set `properties.html' as a side effect.
 */
function create_decoder(properties, data)
{
    const known_charset =
          charset_from_BOM(data) || properties.detected_charset;
    /*
     * A charset that is already known must not be overridden by whatever
     * the document's meta tags claim.
     */
    if (known_charset)
        return new TextDecoder(known_charset);

    if (data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    let sniffed_charset;
    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        sniffed_charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(sniffed_charset || "latin1");
}

/*
 * Return the number of bytes at the end of `data' that form the beginning
 * of a UTF-8 sequence whose remaining bytes are not in `data' (0 when the
 * data does not end in the middle of a sequence).
 */
function incomplete_utf8_tail_length(data)
{
    /* A sequence has at most 4 bytes, so its lead is at most 3 bytes back. */
    const max_back = Math.min(3, data.length);

    for (let back = 1; back <= max_back; back++) {
        const byte = data[data.length - back];
        if ((byte & 0xc0) === 0x80)
            continue; /* continuation byte, the lead byte is further back */

        let needed = 1;
        if ((byte & 0xe0) === 0xc0)
            needed = 2;
        else if ((byte & 0xf0) === 0xe0)
            needed = 3;
        else if ((byte & 0xf8) === 0xf0)
            needed = 4;

        return needed > back ? back : 0;
    }

    return 0;
}

/*
 * Handle one `ondata' event of the stream filter in `properties.filter'.
 *
 * On the first chunk a decoder and an encoder get stored in `properties',
 * a UTF-8 BOM is written and a dummy script element is injected right after
 * the optional doctype declaration. Every chunk is decoded, re-encoded as
 * UTF-8 and written to the filter. When the source already is UTF-8 the
 * filter gets disconnected after the first chunk, so the rest of the
 * response passes through unmodified.
 */
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);

    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
        /* Force UTF-8, this is the only encoding we can produce. */
        properties.filter.write(new Uint8Array(UTF8_BOM));
    }

    const passthrough = properties.decoder.encoding === "utf-8";

    /*
     * We are about to disconnect from a UTF-8 stream. A character cut in
     * half by the chunk boundary must not be decoded (it would turn into
     * U+FFFD); its leading bytes are forwarded raw instead, so that they
     * join the continuation bytes that follow once we are disconnected.
     */
    const tail_length = passthrough ? incomplete_utf8_tail_length(data) : 0;
    const body = data.slice(0, data.length - tail_length);
    const raw_tail = data.slice(data.length - tail_length);

    /*
     * In streaming mode the decoder keeps an incomplete trailing sequence
     * until the next chunk arrives instead of emitting U+FFFD for it.
     */
    let decoded = properties.decoder.decode(body, {stream: !passthrough});

    if (first_chunk) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `meta' tags might already be loaded at that point.
         * Here we inject a dummy script element at the beginning (after the
         * doctype declaration, if any).
         *
         * NOTE(review): the markup literal and the doctype pattern below
         * were damaged in the copy of this file that was reviewed (text in
         * angle brackets had been stripped). Both are reconstructed -
         * confirm them against the upstream file.
         */
        const nonce = properties.policy.nonce;
        const dummy_script =
              `<script data-hachette-deleteme="${nonce}" nonce="${nonce}">null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (passthrough) {
        if (raw_tail.length > 0)
            properties.filter.write(raw_tail);
        properties.filter.disconnect();
    }
}

/*
 * Write out whatever the streaming decoder in `properties' still holds.
 * Does nothing when no data chunk has been seen.
 */
function flush_decoder(properties)
{
    if (!properties.decoder)
        return;

    /* decode() without arguments ends the stream and flushes its state. */
    const rest = properties.decoder.decode();
    if (rest)
        properties.filter.write(properties.encoder.encode(rest));
}

/*
 * Attach a stream filter to the response of the request described by
 * `details' unless `policy.allow' is set. `headers' are always returned
 * unchanged.
 */
function apply_stream_filter(details, headers, policy)
{
    if (policy.allow)
        return headers;

    const properties = properties_from_headers(headers);
    properties.policy = policy;

    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => {
        flush_decoder(properties);
        properties.filter.close();
    };

    /*
     * In the future we might consider modifying the headers that specify
     * encoding. For now we are not yet doing it, though. However, we
     * prepend the data with UTF-8 BOM which should be enough.
     */
    return headers;
}

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */