/**
 * This file is part of Haketilo.
 *
 * Function: Modifying a web page using the StreamFilter API.
 *
 * Copyright (C) 2021, Wojtek Kosior
 * Copyright (C) 2018, Giorgio Maone <giorgio@maone.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * I, Wojtek Kosior, hereby promise not to sue for violation of this file's
 * license. Although I request that you do not make use of this code in a
 * proprietary program, I am not going to enforce this in court.
 *
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

#FROM common/browser.js IMPORT browser
#FROM common/misc.js    IMPORT csp_header_regex

function validate_encoding(charset)
{
    try {
	new TextDecoder(charset);
	return charset;
    } catch(e) {
	return undefined;
    }
}
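
/*
 * For instance, validate_encoding("utf-8") evaluates to "utf-8", while
 * validate_encoding("not-a-real-charset") evaluates to undefined, since the
 * TextDecoder constructor throws a RangeError on unknown labels.
 */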

function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
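/* E.g. captures "ISO-8859-2" from "text/html; charset=ISO-8859-2". */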

function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
	const match = charset_reg.exec(header.value);
	if (match && !properties.detected_charset &&
	    validate_encoding(match[1]))
	    properties.detected_charset = match[1];

	if (/html/i.test(header.value))
	    properties.html = true;
    }

    return properties;
}
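
/*
 * Illustrative example: for headers including
 *     {name: "Content-Type", value: "text/html; charset=ISO-8859-2"}
 * this returns {detected_charset: "ISO-8859-2", html: true}.
 */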

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
	if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
	    return charset;
    }

    return "";
}
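
/*
 * E.g. data starting with bytes 0xef, 0xbb, 0xbf yields "utf-8" and data
 * starting with 0xff, 0xfe yields "utf-16le"; data with no recognized BOM
 * yields "".
 */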

const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");
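/*
 * The above evaluates to: `head>meta[charset],
 * head>meta[http-equiv="content-type"], head>meta[content*="charset"]'.
 */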

function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
	const maybe_charset = meta.getAttribute("charset");
	if (maybe_charset && validate_encoding(maybe_charset))
	    return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
	    return match[1];
    }

    return undefined;
}
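
/*
 * Both of these markup variants make the function return "utf-8":
 *     <meta charset="utf-8">
 *     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 */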

function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;

    if (charset)
	return new TextDecoder(charset);

    if (data.indexOf(0) !== -1) {
        console.warn("Haketilo: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
		     properties);
	return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: The HTML standard requires a `<meta>'-based charset declaration
     * to be complete within the first 1024 bytes of the document. We could
     * therefore process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
	const tmp_doc = new DOMParser().parseFromString(text, "text/html");
	charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
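
/*
 * To summarize the fallback chain above: a BOM wins, then the charset from
 * the Content-Type header, then a UTF-16 guess for streams containing zero
 * bytes, then a charset-declaring `<meta>' tag (for HTML), and finally
 * latin1, which can decode any byte sequence.
 */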

function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
	if (csp_header_regex.test(meta.httpEquiv) && meta.content)
	    return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of HTML
     * data could add some. Before we return `false' we need to be sure we have
     * reached the start of `<body>', where `<meta>' tags are no longer valid.
     */

    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
	doc.body.childNodes.length > 1)
	return false;

    if (!doc.body.firstChild)
	return true;

    if (doc.body.firstChild.nodeName !== "#text")
	return false;

    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}
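
/*
 * For instance, a chunk parsing to
 *     <head><meta http-equiv="Content-Security-Policy"
 *           content="script-src 'none'"></head>
 * makes this function return `true' (assuming `csp_header_regex' imported
 * above matches that header name), while a chunk whose parsed `<body>'
 * already contains an element makes it return `false'. The final regex treats
 * a lone leftover character, `</' or `&#' as a token possibly split at a
 * chunk boundary, so in that case we also cautiously return `true'.
 */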

function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
	first_chunk = true;
	properties.decoder = create_decoder(properties, data);
	properties.encoder = new TextEncoder();
    }

    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8; it is the only encoding we can produce. */
    if (first_chunk)
	properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
	/*
	 * HAX! Our content scripts that execute at `document_start' will always
	 * run before the first script in the document, but under Mozilla some
	 * `<meta>' tags might already be loaded at that point. Here we inject a
	 * dummy `<script>' at the beginning (before any `<meta>' tags) that
	 * will force `document_start' to happen earlier. This way our content
	 * scripts will be able to sanitize `http-equiv' tags with CSP rules
	 * that would otherwise stop our injected scripts from executing.
	 *
	 * As we want to only process HTML files that happen to have naughty
	 * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
	 * `may_define_csp_rules()'. We don't do any additional MIME sniffing as
	 * it is too unreliable (and our heuristic will likely mark non-HTML
	 * files as harmless anyway).
	 */

	const dummy_script = `<script>null</script>`;
	const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
	decoded = doctype_decl + dummy_script +
	    decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
	properties.filter.disconnect();
}
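
/*
 * Note that only the first chunk triggers decoder creation, BOM prepending
 * and the dummy `<script>' injection; later chunks are just re-encoded (for
 * UTF-8 input there are no later chunks to process, as the filter disconnects
 * right away and the remaining bytes flow through unmodified).
 */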

function apply(details, headers, policy)
{
    if (!policy.payload)
	return headers;

    const properties = properties_from_headers(headers);

    properties.filter =
	browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * encoding. For now we do not, though. Instead, we prepend the data with a
     * UTF-8 BOM, which should be enough.
     */
    return headers;
}
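
/*
 * Illustrative usage sketch, not part of this file: the actual wiring lives
 * elsewhere in Haketilo and `get_policy()' below is a hypothetical helper.
 * `apply()' is meant to run from a blocking `webRequest.onHeadersReceived'
 * listener, e.g.:
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => ({responseHeaders: apply(details, details.responseHeaders,
 *                                             get_policy(details))}),
 *         {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 */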
#EXPORT apply