/**
 * Hachette modify HTML document as it loads and reconstruct HTML code from it
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * IMPORTS_START
 * IMPORT gen_nonce
 * IMPORT csp_rule
 * IMPORT is_csp_header_name
 * IMPORT sanitize_csp_header
 * IMPORT sanitize_attributes
 * IMPORTS_END
 */

/*
 * Functions that sanitize elements. The script blocking measures are, when
 * possible, going to be applied together with CSP rules injected using
 * webRequest.
 */

/* Prefix used to build the names of disabled (blocked) attributes. */
const blocked = "blocked";

/*
 * Disable attribute `attr' of `node' by moving its value to an attribute
 * named `blocked-<attr>' and removing the original one.
 *
 * If `blocked-<attr>' (or `blocked-blocked-<attr>', etc.) is already present,
 * the existing values are first moved one level deeper so that nothing gets
 * overwritten. `node' only needs to provide `hasAttribute()',
 * `getAttribute()', `setAttribute()' and `removeAttribute()'. Nothing is
 * returned. If `node' has no attribute `attr', the node is left unchanged.
 */
function block_attribute(node, attr)
{
    /*
     * Disabling attributes this way allows them to still be relatively
     * easily accessed in case they contain some useful data.
     */

    /*
     * Find out how many names of the chain `attr', `blocked-attr',
     * `blocked-blocked-attr', ... are already taken. The parts have to be
     * joined with "-" here, because that is the separator used when the
     * blocked attributes get written below.
     */
    const construct_name = [attr];
    while (node.hasAttribute(construct_name.join("-")))
        construct_name.unshift(blocked);

    /*
     * Move every taken name one level deeper, starting from the deepest one
     * so that each target name is free at the moment it gets written.
     */
    while (construct_name.length > 1) {
        construct_name.shift();
        const name = construct_name.join("-");
        node.setAttribute(`${blocked}-${name}`, node.getAttribute(name));
    }

    node.removeAttribute(attr);
}

/*
 * Sanitize a <script> element according to `data.policy'.
 *
 * A script whose `data-hachette-deleteme' attribute equals the policy's nonce
 * is removed from the document and flagged with `hachette_deleted' and
 * `hachette_ignore'. Unless the policy allows scripts, the script's `type'
 * attribute is then blocked and replaced with "application/json".
 */
function sanitize_script(script, data)
{
    const {policy} = data;
    const delete_marker = script.getAttribute("data-hachette-deleteme");

    if (delete_marker === policy.nonce) {
        script.remove();
        Object.assign(script, {
            hachette_deleted: true,
            hachette_ignore: true
        });
    }

    if (!policy.allow) {
        /* Keep the old type under a blocked name, then neutralize it. */
        block_attribute(script, "type");
        script.setAttribute("type", "application/json");
    }
}

/*
 * Unless the policy allows scripts, put a <meta> element carrying our
 * Content-Security-Policy rule at the very beginning of `head'.
 *
 * The created element is flagged with `hachette_ignore' and placed at the
 * front of the `data.new_added' queue together with its parent.
 */
function inject_csp(head, data)
{
    if (!data.policy.allow) {
        const csp_meta = document.createElement("meta");
        const attributes = [
            ["http-equiv", "Content-Security-Policy"],
            ["content", csp_rule(data.policy.nonce)]
        ];

        for (const [name, value] of attributes)
            csp_meta.setAttribute(name, value);

        /* This node is our own creation. */
        csp_meta.hachette_ignore = true;
        head.prepend(csp_meta);

        data.new_added.unshift([csp_meta, head]);
    }
}

/*
 * Handle a <meta> element that may carry a CSP rule in its `http-equiv' and
 * `content' attributes.
 *
 * Nothing is done when `content' is empty or when `is_csp_header_name()'
 * rejects the `http-equiv' value (it is passed `!data.policy.allow' as its
 * second argument). Otherwise the `content' attribute gets blocked. It is
 * then set again, to the value returned by `sanitize_csp_header()', if the
 * policy allows scripts or if `is_csp_header_name()' also accepts the name
 * with `false' as its second argument.
 */
function sanitize_http_equiv_csp_rule(meta, data)
{
    const {policy} = data;
    const header_name = meta.getAttribute("http-equiv");
    const value = meta.content;

    if (!value)
        return;
    if (!is_csp_header_name(header_name, !policy.allow))
        return;

    block_attribute(meta, "content");

    const reinstate = policy.allow || is_csp_header_name(header_name, false);
    if (reinstate) {
        const sanitized = sanitize_csp_header({value}, policy);
        meta.content = sanitized.value;
    }
}

/*
 * Apply the sanitizing routine matching the type of `node' (if there is one)
 * and, unless the policy allows scripts, sanitize the node's attributes.
 */
function sanitize_node(node, data)
{
    switch (node.tagName) {
    case "SCRIPT":
        sanitize_script(node, data);
        break;
    case "HEAD":
        inject_csp(node, data);
        break;
    case "META":
        sanitize_http_equiv_csp_rule(node, data);
        break;
    }

    if (data.policy.allow)
        return;

    sanitize_attributes(node, data);
}

/*
 * Instead of calling writer directly with multiple small chunks of
 * reconstructed HTML code, we utilize `setTimeout()' to only have it called
 * once, asynchronously.
 */
function do_write_callback(data)
{
    /*
     * Hand everything gathered so far to `writer' as a single string and
     * start over with a fresh, empty list of chunks.
     */
    const {chunks} = data;
    data.writer(chunks.join(""));
    data.chunks = [];

    /* If this was the last write, `finisher' (if provided) gets called. */
    if (!data.finished)
        return;

    if (data.finisher)
        data.finisher();
}

/*
 * Queue `chunk' for writing. The actual write is performed by
 * `do_write_callback()', scheduled with a zero timeout; any write scheduled
 * earlier and still pending is cancelled first, so all queued chunks end up
 * being flushed together.
 */
function do_write(chunk, data)
{
    data.chunks.push(chunk);

    const flush = () => do_write_callback(data);

    clearTimeout(data.write_timeout);
    data.write_timeout = setTimeout(flush, 0);
}

/*
 * Shared serializer, used by `finish_serializing_node()' as a fallback for
 * nodes whose `outerHTML' is empty or missing.
 */
const serializer = new XMLSerializer();

/*
 * Mark `node' as started and, if a `writer' was provided, queue the part of
 * its HTML code that precedes its contents for writing.
 *
 * That part is obtained from a shallow clone of `node' whose text content is
 * set to the `data.uniq' marker: whatever `data.uniq_reg' captures in front
 * of the marker in the clone's `outerHTML' gets written.
 */
function start_serializing_node(node, data)
{
    node.hachette_started = true;

    if (data.writer) {
        const shell = node.cloneNode(false);
        shell.textContent = data.uniq;

        const [, opening] = data.uniq_reg.exec(shell.outerHTML);
        do_write(opening, data);
    }
}

/*
 * Finish serializing `node' together with the chain of nodes reachable from
 * it through `hachette_last_added' properties. Calling this with a missing
 * (e.g. `undefined') `node' does nothing.
 *
 * Nodes are handled starting from the end of that chain. Each of them is
 * removed from its parent and flagged with `hachette_ignore'. If a `writer'
 * was provided, HTML code also gets queued for writing: for nodes marked as
 * started it is the part following the node's contents (found with the help
 * of the `data.uniq' marker), for other nodes it is their entire code.
 */
function finish_serializing_node(node, data)
{
    /* Gather the whole chain first, before any node gets modified. */
    const chain = [];
    for (let link = node; link; link = link.hachette_last_added)
        chain.push(link);

    for (const current of chain.reverse()) {
        current.remove();
        current.hachette_ignore = true;

        if (!data.writer)
            continue;

        let html;
        if (current.hachette_started) {
            /* The beginning was written already, only the end is missing. */
            current.textContent = data.uniq;
            html = data.uniq_reg.exec(current.outerHTML)[2];
        } else {
            html = current.outerHTML || serializer.serializeToString(current);
        }

        do_write(html, data);
    }
}

/*
 * On the first call only: start serializing `data.html_root' and queue the
 * nodes that are already present under it.
 *
 * The tree under `data.html_root' is walked in document order, stopping
 * before `node' (or going through the entire tree if `node' is not found in
 * it, e.g. when it is `undefined'). Every node encountered is put, together
 * with its parent, at the front of the `data.new_added' queue. The walk never
 * leaves the `data.html_root' subtree. Subsequent calls are no-ops.
 */
function process_initial_nodes(node, data)
{
    if (data.processed_initial_nodes)
        return;

    data.processed_initial_nodes = true;

    start_serializing_node(data.html_root, data);

    const new_added = [];

    /*
     * Used as a stack with its top at index 0. While a node is waiting to
     * be visited, the elements that follow it are its ancestors, the parent
     * coming first.
     */
    const nodes_to_process = [data.html_root];

    while (nodes_to_process.length > 0) {
        let current = nodes_to_process.shift();

        if (current.firstChild) {
            if (current.firstChild === node)
                break;
            nodes_to_process.unshift(current.firstChild, current);
            new_added.push([current.firstChild, current]);
            continue;
        }

        /*
         * No children, so climb up until something with a next sibling is
         * found. The root's own siblings are not part of the tree we are
         * responsible for (and we would have no parent to pair them with),
         * so the climb ends at the root.
         */
        while (current && current !== data.html_root && !current.nextSibling)
            current = nodes_to_process.shift();

        if (!current || current === data.html_root ||
            current.nextSibling === node)
            break;

        /*
         * After the sibling gets pushed, its parent is right behind it on
         * the stack.
         */
        nodes_to_process.unshift(current.nextSibling);
        new_added.push([current.nextSibling, nodes_to_process[1]]);
    }

    data.new_added.unshift(...new_added);
}

/*
 * Important! Due to some weirdness node.parentElement is not always correct
 * in MutationRecords under Chromium. Track node relations manually.
 */
function handle_added_node(node, true_parent, data)
{
    /*
     * The functions called while processing a node may create new nodes or
     * find existing ones that have to be processed before the node we were
     * given. They are expected to put such nodes at the very beginning of the
     * `new_added' queue and, for nodes they created, to set the
     * `hachette_ignore' property so that the corresponding MutationRecord
     * does not get processed. They can also set the `hachette_deleted'
     * property of a node that is already in the queue to exclude it from
     * processing. Hence we work through the queue until it becomes empty
     * rather than handle just our single node.
     */

    process_initial_nodes(node, data);

    data.new_added.push([node, true_parent]);

    while (data.new_added.length > 0) {
        const [queued_node, queued_parent] = data.new_added.shift();
        process_queued_node(queued_node, queued_parent, data);
    }
}

/*
 * Process a single node taken from the `new_added' queue. `true_parent' is
 * the parent that was queued together with it.
 */
function process_queued_node(node, true_parent, data)
{
    /* Deletion of a parent extends to the nodes added under it. */
    if (true_parent.hachette_deleted)
        node.hachette_deleted = true;
    if (node.hachette_deleted)
        return;

    if (!true_parent.hachette_started)
        start_serializing_node(true_parent, data);

    if (!node.hachette_ignore)
        sanitize_node(node, data);

    /* Sanitizing might have marked the node as deleted. */
    if (node.hachette_deleted)
        return;

    if (data.node_eater)
        data.node_eater(node, true_parent);

    /* The previous node added to this parent is complete by now. */
    finish_serializing_node(true_parent.hachette_last_added, data);

    true_parent.hachette_last_added = node;
}

/*
 * Pass the nodes listed as added in `mutations' to `handle_added_node()',
 * skipping those that carry the `hachette_ignore' mark. Nodes added under a
 * parent that carries this mark get marked (and hence skipped) as well.
 */
function handle_mutation(mutations, data)
{
    /*
     * Chromium: for an unknown reason the `target' of a mutation is not
     * always the same as node.parentElement. The former is the correct one,
     * so that is what gets passed on as the parent.
     */
    for (const {target, addedNodes} of mutations) {
        for (const added of addedNodes) {
            /* Check for nodes added by ourselves. */
            if (target.hachette_ignore)
                added.hachette_ignore = true;

            if (!added.hachette_ignore)
                handle_added_node(added, target, data);
        }
    }
}

/*
 * Wrap up the processing: handle mutation records that are still pending,
 * stop observing, tidy up whitespace at the end of <body> and finish
 * serializing `data.html_root'.
 */
function finish_processing(data)
{
    /* With no node to stop at, the initial walk is not cut short. */
    process_initial_nodes(undefined, data);

    /*
     * If a `finisher' callback was provided, it should be called. Normally
     * this is done by our function that performs the last write, once it
     * sees that `finished' is `true'. When there is no `writer' callback,
     * though, no writes are performed, so calling `finisher' has to be
     * arranged here (see the end of this function).
     */
    data.finished = true;

    const pending = data.observer.takeRecords();
    handle_mutation(pending, data);
    data.observer.disconnect();

    /*
     * Whitespace that followed `</body>' ends up appended to body. It's a
     * minor issue but not what we want. As it is impossible to tell which
     * part of the whitespace was before `</body>' and which after, all of it
     * gets replaced with a single newline, which looks good when printed.
     */
    const body = data.html_root.lastChild;
    const is_body = body && body.tagName === "BODY";
    const last = is_body && body.lastChild;

    if (last && last.nodeName === "#text") {
        const match = /^([\S\s]*\S)?\s*$/.exec(last.textContent);
        last.textContent = `${match[1] || ""}\n`;
    }

    finish_serializing_node(data.html_root, data);

    const must_call_finisher = !data.writer && data.finisher;
    if (must_call_finisher)
        setTimeout(data.finisher, 0);
}

/*
 * This function sanitizes `html_root' according to `policy'. It is capable of
 * working on an HTML document that is being written to, sanitizing new nodes
 * as they appear.
 *
 * `consumers' object may contain 3 optional callback functions: `writer',
 * `node_eater' and `finisher'. The first one, if present, is called with chunks
 * of reconstructed HTML code. The second one, if present, gets called for every
 * added node with 2 arguments: that node and its parent. The third one is
 * called at the end, after all processing has been done.
 *
 * `modify_on_the_fly()' returns a callback that should be called (with no
 * arguments) once the document of html_root has finished being written to.
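 * Unfortunately, due to the specific behavior of a document that has had its
 * documentElement replaced, this has to be signaled by the caller. (This
 * sentence was left unfinished in the original comment; presumably the end
 * of writing cannot be detected from within this module - TODO confirm.)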
 */
function modify_on_the_fly(html_root, policy, consumers)
{
    /*
     * `uniq' serves as a marker put in place of a node's contents, with
     * `uniq_reg' capturing what precedes and what follows it.
     */
    const uniq = gen_nonce();
    const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);

    /* All state gets kept in one object, callbacks from `consumers' too. */
    const data = Object.assign(
        {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []},
        consumers
    );

    const on_mutations = records => handle_mutation(records, data);
    const observer = new MutationObserver(on_mutations);
    observer.observe(data.html_root, {
        attributes: true,
        childList: true,
        subtree: true
    });

    data.observer = observer;

    return () => finish_processing(data);
}

/*
 * EXPORTS_START
 * EXPORT modify_on_the_fly
 * EXPORTS_END
 */