/**
* This file is part of Haketilo.
*
* Function: Modifying a web page using the StreamFilter API.
*
* Copyright (C) 2021, Wojtek Kosior
* Copyright (C) 2018, Giorgio Maone <giorgio@maone.net>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
* I, Wojtek Kosior, thereby promise not to sue for violation of this file's
 * license. Although I request that you do not make use of this code in a
* proprietary program, I am not going to enforce this in court.
*
*
* Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
* in LibreJS.
*/
/*
* IMPORTS_START
* IMPORT browser
* IMPORT csp_header_regex
* IMPORTS_END
*/

function validate_encoding(charset)
{
    try {
        /* TextDecoder throws a RangeError when given an unknown label. */
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}
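
/*
 * An illustration (not part of the original file): any encoding label from
 * the WHATWG Encoding Standard is accepted, anything else is rejected.
 *     validate_encoding("ISO-8859-2")    ->  "ISO-8859-2"
 *     validate_encoding("not-a-charset") ->  undefined
 */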

function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        if (match && !properties.detected_charset &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];
        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}
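
/*
 * An illustration (not from the original code): for headers such as
 *     [{name: "Content-Type", value: "text/html; charset=ISO-8859-1"}]
 * the result should be
 *     {detected_charset: "ISO-8859-1", html: true}
 * while a bare "application/json" Content-Type yields just {}.
 */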

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}
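
/*
 * An illustration (not from the original code): only the leading bytes of
 * the payload are considered.
 *     charset_from_BOM(new Uint8Array([0xff, 0xfe, 0x3c, 0x00])) -> "utf-16le"
 *     charset_from_BOM(new Uint8Array([0x3c, 0x68, 0x74, 0x6d])) -> ""
 */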

const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");

function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}
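
/*
 * An illustration (not from the original code): each of these <head>
 * children is matched by `charset_meta_selector' and resolves to "utf-8":
 *     <meta charset="utf-8">
 *     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 */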

function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
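
/*
 * To sum up the order of precedence applied above: a BOM wins, then the
 * charset from the Content-Type header, then the zero-byte UTF-16
 * heuristic, then <meta> tag sniffing, with "latin1" as the fallback that
 * can decode any byte sequence without throwing.
 */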

function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(meta.httpEquiv) && meta.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of
     * HTML data could still add some. Before we return `false' we need to
     * be sure we have reached the start of `<body>', where `<meta>' tags
     * are no longer valid.
     */
    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
        doc.body.childNodes.length > 1)
        return false;

    if (!doc.body.firstChild)
        return true;

    if (doc.body.firstChild.nodeName !== "#text")
        return false;

    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}
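
/*
 * Illustrative behavior (not from the original code), assuming
 * `csp_header_regex' matches "content-security-policy":
 *     '<head><meta http-equiv="Content-Security-Policy"
 *            content="script-src none"></head>'        -> true
 *     '<p>a complete chunk with no such tags</p>'      -> false
 *     '<html><he'  (truncated, might still grow one)   -> true
 */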

function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    /*
     * Decode in streaming mode so that multi-byte characters split across
     * chunk boundaries are not corrupted.
     */
    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen
         * earlier. This way our content scripts will be able to sanitize
         * `http-equiv' tags with CSP rules that would otherwise stop our
         * injected scripts from executing.
         *
         * As we only want to process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME
         * sniffing as it is too unreliable (and our heuristic will likely
         * mark non-HTML files as harmless anyway).
         */
        const dummy_script = `<script>null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
                  decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}
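
/*
 * An illustration (not from the original code): for a first chunk whose
 * `<head>' carries a CSP `<meta>' tag,
 *     <!doctype html><html><head><meta http-equiv="content-security-policy" ...
 * the bytes written to the filter decode to
 *     <BOM><!doctype html><script>null</script><html><head><meta ...
 */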

function apply_stream_filter(details, headers, policy)
{
    if (!policy.payload)
        return headers;

    const properties = properties_from_headers(headers);
    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * the encoding. We are not doing that yet; prepending the data with a
     * UTF-8 BOM should be enough for now.
     */
    return headers;
}
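
/*
 * A minimal usage sketch (hypothetical, not part of this module): the
 * function is meant to be wired into a blocking `onHeadersReceived'
 * listener, with a `policy' object whose truthy `payload' field enables
 * filtering:
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => ({
 *             responseHeaders: apply_stream_filter(
 *                 details, details.responseHeaders, {payload: true})
 *         }),
 *         {urls: ["<all_urls>"], types: ["main_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 *
 * Note that `filterResponseData()' is Firefox-only and requires the
 * "webRequestBlocking" permission.
 */
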
/*
* EXPORTS_START
* EXPORT apply_stream_filter
* EXPORTS_END
*/