aboutsummaryrefslogtreecommitdiff
path: root/common/patterns.js
blob: 054e6109b8eb5be56bad76993117aebc4aac61ce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
/**
 * This file is part of Haketilo.
 *
 * Function: Operations on page URL patterns.
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * Limits applied by deconstruct_url() when it is called with use_limits
 * left at its default (true).
 */
const MAX = {
    /* Max number of path segments kept; further ones get spliced off. */
    URL_PATH_LEN:   12,
    /* Path string length above which the path gets shortened. */
    URL_PATH_CHARS: 255,
    /* Max number of domain labels kept; leftmost ones get dropped. */
    DOMAIN_LEN:     7,
    /* Domain string length above which leading labels get cut off. */
    DOMAIN_CHARS:   100
};

const proto_regex = /^(\w+):\/\/(.*)$/;

const user_re = "[^/?#@]+@"
const domain_re = "[.*a-zA-Z0-9-]+";
const path_re = "[^?#]*";
const query_re = "\\??[^#]*";

const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);

const file_regex = new RegExp(`^(/${path_re}).*`);

const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);

/*
 * Run `regex' against `string' and hand back the resulting match array.
 * When there is no match, `error_msg' itself is thrown (as given, without
 * being wrapped in an Error object).
 */
function match_or_throw(regex, string, error_msg)
{
    const result = regex.exec(string);
    if (result !== null)
	return result;

    throw error_msg;
}

/*
 * Split `url' into the components used for pattern matching.
 *
 * Supported schemes are file, ftp, http and https. The returned object can
 * have the following properties:
 *   proto            - scheme, as captured from the URL;
 *   domain           - array of domain labels (not set for file:// URLs),
 *                      lowercased for http(s) only;
 *   path             - array of non-empty path segments;
 *   query            - query part including leading `?' (http(s) only);
 *   trailing_dash    - true if the path part ended with `/';
 *   domain_truncated - set to true if domain labels were dropped;
 *   path_truncated   - set to true if the path was shortened.
 *
 * Limits from MAX are applied unless `use_limits' is false. Path limits are
 * only applied to URLs that have a domain.
 *
 * Throws a string if `url' cannot be parsed or its scheme is unsupported.
 */
function deconstruct_url(url, use_limits=true)
{
    /*
     * Work on a per-call copy of the limits. Writing Infinity into MAX
     * itself would silently disable limits for every later call.
     */
    const max = Object.assign({}, MAX);
    if (!use_limits) {
	for (const key in max)
	    max[key] = Infinity;
    }

    const matcher = (re, str) => match_or_throw(re, str, `bad url '${url}'`);

    const proto_match = matcher(proto_regex, url);
    const deco = {proto: proto_match[1]};

    if (deco.proto === "file") {
	deco.path = matcher(file_regex, proto_match[2])[1];
    } else if (deco.proto === "ftp") {
	/* Group 1 is the optional user part, which gets discarded. */
	[deco.domain, deco.path] =
	    matcher(ftp_regex, proto_match[2]).slice(2, 4);
    } else if (deco.proto === "http" || deco.proto === "https") {
	[deco.domain, deco.path, deco.query] =
	    matcher(http_regex, proto_match[2]).slice(1, 4);
	deco.domain = deco.domain.toLowerCase();
    } else {
	throw `unsupported protocol in url '${url}'`;
    }

    /* Has to be computed before path gets truncated and split. */
    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";

    if (deco.domain) {
	if (deco.domain.length > max.DOMAIN_CHARS) {
	    /*
	     * Keep only whole labels that fit in the last DOMAIN_CHARS
	     * characters; if there is no label boundary there, drop the
	     * domain entirely.
	     */
	    const idx = deco.domain.indexOf(".", deco.domain.length -
					    max.DOMAIN_CHARS);
	    if (idx === -1)
		deco.domain = [];
	    else
		deco.domain = deco.domain.substring(idx + 1);

	    deco.domain_truncated = true;
	}

	if (deco.path.length > max.URL_PATH_CHARS) {
	    /*
	     * NOTE(review): this only cuts off the last path segment, so
	     * the result may still be longer than URL_PATH_CHARS - confirm
	     * whether the search should be bounded by the limit.
	     */
	    deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
	    deco.path_truncated = true;
	}
    }

    /* Domain is already an (empty) array if it got dropped above. */
    if (typeof deco.domain === "string") {
	deco.domain = deco.domain.split(".");
	/* Drop leftmost labels so that at most DOMAIN_LEN remain. */
	if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
	    > 0)
	    deco.domain_truncated = true;
    }

    deco.path = deco.path.split("/").filter(s => s !== "");
    /* Drop path segments beyond the first URL_PATH_LEN ones. */
    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
	deco.path_truncated = true;

    return deco;
}

/*
 * Yield domain patterns for deco.domain (an array of labels), starting from
 * the full domain and dropping one leftmost label at a time. The last label
 * alone is never used as a pattern base, so a single-label domain yields
 * nothing.
 *
 * For each base (`slice' = number of labels dropped):
 *   - the base itself          - only with no labels dropped;
 *   - "*." + base              - only with exactly 1 label dropped;
 *   - "**." + base             - only with 2 or more labels dropped;
 *   - "***." + base            - always.
 * The first two are skipped when deco.domain_truncated is set.
 */
function* each_domain_pattern(deco)
{
    for (let slice = 0; slice < deco.domain.length - 1; slice++) {
	const domain_part = deco.domain.slice(slice).join(".");
	if (slice === 0 && !deco.domain_truncated)
	    yield domain_part;
	if (slice === 1 && !deco.domain_truncated)
	    yield "*." + domain_part;
	if (slice > 1)
	    yield "**." + domain_part;
	yield "***." + domain_part;
    }
}

/*
 * Yield path patterns for deco.path (an array of segments), starting from
 * the full path and dropping one trailing segment at a time, down to the
 * empty path.
 *
 * For each prefix (`slice' = number of segments kept):
 *   - the prefix itself (preceded by its variant with trailing `/' if
 *     deco.trailing_dash is set) - only for the full path; the bare empty
 *     prefix is not yielded for file:// URLs;
 *   - prefix + "/*"   - only with exactly 1 segment dropped, unless that
 *                       segment is literally "*";
 *   - prefix + "/**"  - only with 2 or more segments dropped;
 *   - prefix + "/***" - always, except when exactly 1 segment was dropped,
 *                       that segment is literally "***" and the path was
 *                       not truncated.
 * The first two kinds are skipped when deco.path_truncated is set.
 */
function* each_path_pattern(deco)
{
    for (let slice = deco.path.length; slice >= 0; slice--) {
	const path_part = ["", ...deco.path.slice(0, slice)].join("/");
	if (slice === deco.path.length && !deco.path_truncated) {
	    if (deco.trailing_dash)
		yield path_part + "/";
	    if (slice > 0 || deco.proto !== "file")
		yield path_part;
	}
	if (slice === deco.path.length - 1 && !deco.path_truncated &&
	    deco.path[slice] !== "*")
	    yield path_part + "/*";
	if (slice < deco.path.length - 1)
	    yield path_part + "/**";
	if (slice !== deco.path.length - 1 || deco.path_truncated ||
	    deco.path[slice] !== "***")
	    yield path_part + "/***";
    }
}

/*
 * Generate every possible pattern that matches url.
 *
 * Patterns are built as `<proto>://<domain pattern><path pattern>', with
 * domain patterns iterated in the outer loop and path patterns in the inner
 * one. URLs without a domain (file://) use an empty domain part.
 *
 * deconstruct_url() either returns an object or throws, so its errors
 * propagate to the caller. Since this is a generator, that happens when
 * iteration starts and not when each_url_pattern() gets called.
 */
function* each_url_pattern(url)
{
    const deco = deconstruct_url(url);

    const all_domains = deco.domain ? each_domain_pattern(deco) : [""];
    for (const domain of all_domains) {
	for (const path of each_path_pattern(deco))
	    yield `${deco.proto}://${domain}${path}`;
    }
}

/*
 * EXPORTS_START
 * EXPORT each_url_pattern
 * EXPORT deconstruct_url
 * EXPORTS_END
 */