aboutsummaryrefslogtreecommitdiff
/**
 * This file is part of Haketilo.
 *
 * Function: Operations on page URL patterns.
 *
 * Copyright (C) 2021 Wojtek Kosior
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * As additional permission under GNU GPL version 3 section 7, you
 * may distribute forms of that code without the copy of the GNU
 * GPL normally required by section 4, provided you include this
 * license notice and, in case of non-source distribution, a URL
 * through which recipients can access the Corresponding Source.
 * If you modify file(s) with this exception, you may extend this
 * exception to your version of the file(s), but you are not
 * obligated to do so. If you do not wish to do so, delete this
 * exception statement from your version.
 *
 * As a special exception to the GPL, any HTML file which merely
 * makes function calls to this code, and for that purpose
 * includes it by reference shall be deemed a separate work for
 * copyright law purposes. If you modify this code, you may extend
 * this exception to your version of the code, but you are not
 * obligated to do so. If you do not wish to do so, delete this
 * exception statement from your version.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
 * license. Although I request that you do not make use this code in a
 * proprietary program, I am not going to enforce this in court.
 */

/*
 * Default limits used by deconstruct_url(). The *_LEN entries bound how many
 * path segments / domain labels are kept, the *_CHARS entries are compared
 * against the length of the raw path / domain strings.
 */
const MAX = {
    URL_PATH_LEN:   12,
    URL_PATH_CHARS: 255,
    DOMAIN_LEN:     7,
    DOMAIN_CHARS:   100
};

/* Captures the protocol name and everything that follows "://". */
const proto_regex = /^(\w+):\/\/(.*)$/;

/* Regex source fragments for the individual URL components. */
const user_re   = "[^/?#@]+@";
const domain_re = "[^/?#]+";
const path_re   = "[^?#]*";
const query_re  = "\\??[^#]*";

/*
 * The regexes below are applied to the part of the URL after "://". Each
 * component gets its own capture group; whatever trails the last group
 * (e.g. a "#fragment") is consumed by the final ".*" and ignored.
 */

/* Groups: 1 - domain, 2 - path, 3 - query. */
const http_regex = new RegExp(
    "^(" + domain_re + ")(" + path_re + ")(" + query_re + ").*"
);

/* Groups: 1 - path. */
const file_regex = new RegExp("^(" + path_re + ").*");

/* Groups: 1 - optional "user@" prefix, 2 - domain, 3 - path. */
const ftp_regex = new RegExp(
    "^(" + user_re + ")?(" + domain_re + ")(" + path_re + ").*"
);

/*
 * Split a URL into the components that patterns are built from.
 *
 * Parameters:
 *   url        - URL string of the form "<proto>://<rest>".
 *   use_limits - when true (the default) the limits from MAX are enforced;
 *                when false no truncation is performed at all.
 *
 * Returns an object with the properties:
 *   proto            - protocol name ("file", "ftp", "http" or "https");
 *   domain           - array of domain labels (not set for "file" URLs);
 *   path             - array of non-empty path segments;
 *   query            - query string including "?" (http(s) only);
 *   trailing_dash    - whether the original path ended with "/";
 *   domain_truncated - set to true when leading domain labels were dropped;
 *   path_truncated   - set to true when trailing path segments were dropped.
 * Returns undefined when the part after "://" cannot be parsed for an http,
 * https or ftp URL (e.g. the domain is empty).
 *
 * Throws a string when the URL has no "<proto>://" prefix or the protocol is
 * not supported.
 */
function deconstruct_url(url, use_limits=true)
{
    /*
     * Build a private copy of the limits. Writing Infinity into MAX itself
     * would silently disable the limits for every later call.
     */
    const max = {};
    for (const key of Object.keys(MAX))
	max[key] = use_limits ? MAX[key] : Infinity;

    const proto_match = proto_regex.exec(url);
    if (proto_match === null)
	throw `bad url '${url}'`;

    const deco = {proto: proto_match[1]};

    if (deco.proto === "file") {
	/* file_regex can match the empty string, so exec() never fails. */
	deco.path = file_regex.exec(proto_match[2])[1];
    } else if (deco.proto === "ftp") {
	const ftp_match = ftp_regex.exec(proto_match[2]);
	if (!ftp_match)
	    return undefined;
	/* Group 1 is the optional "user@" part, which is not used. */
	[deco.domain, deco.path] = ftp_match.slice(2, 4);
    } else if (deco.proto === "http" || deco.proto === "https") {
	const http_match = http_regex.exec(proto_match[2]);
	if (!http_match)
	    return undefined;
	[deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
    } else {
	throw `unsupported protocol in url '${url}'`;
    }

    /* Must be recorded before the path gets truncated or split. */
    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";

    /* Character limits are only applied to URLs that have a domain. */
    if (deco.domain) {
	if (deco.domain.length > max.DOMAIN_CHARS) {
	    /*
	     * Keep the longest suffix that starts right after a "." and
	     * fits in the limit; keep nothing if there is no such suffix.
	     */
	    const idx = deco.domain.indexOf(".", deco.domain.length -
					    max.DOMAIN_CHARS);
	    if (idx === -1)
		deco.domain = [];
	    else
		deco.domain = deco.domain.substring(idx + 1);

	    deco.domain_truncated = true;
	}

	if (deco.path.length > max.URL_PATH_CHARS) {
	    /*
	     * Cut at the last "/" located within the limit, so that the
	     * remaining prefix consists of whole segments and really is
	     * no longer than URL_PATH_CHARS. The path always starts with
	     * "/" here, hence the search cannot fail.
	     */
	    const cut = deco.path.lastIndexOf("/", max.URL_PATH_CHARS);
	    deco.path = deco.path.substring(0, cut);
	    deco.path_truncated = true;
	}
    }

    /* domain is already an (empty) array if it got truncated to nothing. */
    if (typeof deco.domain === "string") {
	deco.domain = deco.domain.split(".");
	/* Drop leading labels so that at most DOMAIN_LEN remain. */
	if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
	    > 0)
	    deco.domain_truncated = true;
    }

    deco.path = deco.path.split("/").filter(s => s !== "");
    /* Drop trailing segments so that at most URL_PATH_LEN remain. */
    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
	deco.path_truncated = true;

    return deco;
}

/*
 * Yield the domain parts of all patterns matching the domain in deco (an
 * object as returned by deconstruct_url()), from the most specific to the
 * least specific one.
 *
 * Each suffix of the label list that is at least 2 labels long is considered.
 * The complete domain is yielded verbatim, the suffix lacking 1 label gets
 * prefixed with "*." and shorter suffixes get prefixed with "**.". On top of
 * that, every suffix is yielded with the "***." prefix. The verbatim and "*."
 * variants are skipped if deco.domain_truncated is set.
 */
function* each_domain_pattern(deco)
{
    const labels = deco.domain;

    for (let dropped = 0; dropped + 1 < labels.length; dropped++) {
	const suffix = labels.slice(dropped).join(".");

	switch (dropped) {
	case 0:
	    if (!deco.domain_truncated)
		yield suffix;
	    break;
	case 1:
	    if (!deco.domain_truncated)
		yield "*." + suffix;
	    break;
	default:
	    yield "**." + suffix;
	}

	yield "***." + suffix;
    }
}

/*
 * Yield the path parts of all patterns matching the path in deco (an object
 * as returned by deconstruct_url()), from the most specific to the least
 * specific one.
 *
 * Every prefix of the segment list is considered, longest first:
 *   - the complete path is yielded verbatim (preceded by its variant with a
 *     trailing "/" if the URL had one), unless the path was truncated;
 *   - the prefix lacking 1 segment is yielded with "/*" appended, unless the
 *     path was truncated or the omitted segment is itself a literal "*";
 *   - shorter prefixes are yielded with "/**" appended;
 *   - every prefix is yielded with "/***" appended, except when that would
 *     merely reproduce an untruncated path whose last segment is a literal
 *     "***".
 */
function* each_path_pattern(deco)
{
    for (let slice = deco.path.length; slice >= 0; slice--) {
	/* The leading "" makes join() produce a leading "/"; for slice 0
	 * the result is the empty string. */
	const path_part = ["", ...deco.path.slice(0, slice)].join("/");

	if (slice === deco.path.length && !deco.path_truncated) {
	    /* An empty path gets no "/"-only variant. */
	    if (deco.trailing_dash && path_part !== "")
		yield path_part + "/";
	    /* A file URL with an empty path gets no verbatim variant. */
	    if (path_part !== "" || deco.proto !== "file")
		yield path_part;
	}
	if (slice === deco.path.length - 1 && !deco.path_truncated &&
	    deco.path[slice] !== "*")
	    yield path_part + "/*";
	if (slice < deco.path.length - 1)
	    yield path_part + "/**";
	if (slice !== deco.path.length - 1 || deco.path_truncated ||
	    deco.path[slice] !== "***")
	    yield path_part + "/***";
    }
}

/*
 * Generate every possible pattern that matches url.
 *
 * The URL gets deconstructed with deconstruct_url() and each domain part
 * from each_domain_pattern() is combined with each path part from
 * each_path_pattern(). URLs without a domain use an empty domain part.
 *
 * If the URL could not be deconstructed (deconstruct_url() returned
 * undefined), an error is logged, nothing is yielded and the generator
 * returns false. Exceptions thrown by deconstruct_url() are not caught here.
 */
function* each_url_pattern(url)
{
    const parts = deconstruct_url(url);

    if (parts !== undefined) {
	const domain_parts = parts.domain ? each_domain_pattern(parts) : [""];

	for (const domain of domain_parts) {
	    /* Path patterns are generated anew for every domain part. */
	    for (const path of each_path_pattern(parts))
		yield `${parts.proto}://${domain}${path}`;
	}

	return;
    }

    console.error("bad url format", url);
    return false;
}

/*
 * EXPORTS_START
 * EXPORT each_url_pattern
 * EXPORT deconstruct_url
 * EXPORTS_END
 */