summaryrefslogtreecommitdiff
path: root/common/patterns.js
blob: ebb55abb8485df5ed684fc79d09b6e9501e12dff (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/**
 * Hachette operations on page url patterns
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/* Splits "<proto>://<rest>" into the protocol name and everything after it. */
const proto_regex = /^(\w+):\/\/(.*)$/;

/*
 * Regex source fragments describing single components of the part of a URL
 * that follows "<proto>://". They get combined into per-protocol regexes below.
 */
const user_re = "[^/?#@]+@";
const domain_re = "[^/?#]+";
const path_re = "[^?#]*";
const query_re = "\\??[^#]*";

/* Capture groups: 1 - domain, 2 - path, 3 - query. */
const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);

/* Capture groups: 1 - path. */
const file_regex = new RegExp(`^(${path_re}).*`);

/* Capture groups: 1 - optional "user@", 2 - domain, 3 - path. */
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);

/*
 * Split a URL (or a URL pattern) into components.
 *
 * Returns undefined when `url` has no "<proto>://" prefix or when a protocol
 * other than "file" lacks a domain. Otherwise returns an object with:
 *   proto         - protocol name (the part before "://"),
 *   domain        - array of "."-separated domain segments (absent for "file"),
 *   path          - array of non-empty "/"-separated path segments; an empty
 *                   string is prepended when the path started with "/" or had
 *                   no segments at all,
 *   trailing_dash - true when the path ended with "/" (this file calls the "/"
 *                   character a "dash"),
 *   query         - query string, including "?" if present (only set for
 *                   protocols other than "file" and "ftp").
 */
function deconstruct_url(url)
{
    const proto_match = proto_regex.exec(url);
    if (proto_match === null)
	return undefined;

    const deco = {proto: proto_match[1]};
    const rest = proto_match[2];

    if (deco.proto === "file") {
	/* Every part of file_regex may be empty, so it matches any input. */
	deco.path = file_regex.exec(rest)[1];
    } else if (deco.proto === "ftp") {
	const ftp_match = ftp_regex.exec(rest);
	/* No match means the domain is missing, e.g. "ftp:///pub". */
	if (ftp_match === null)
	    return undefined;
	/* Group 1 (the optional "user@" part) is deliberately dropped. */
	[deco.domain, deco.path] = ftp_match.slice(2, 4);
    } else {
	const http_match = http_regex.exec(rest);
	if (http_match === null)
	    return undefined;
	[deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
    }

    if (deco.domain)
	deco.domain = deco.domain.split(".");

    const leading_dash = deco.path[0] === "/";
    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";
    deco.path = deco.path.split("/").filter(s => s !== "");
    /* The leading "" makes join("/") reproduce the initial "/". */
    if (leading_dash || deco.path.length === 0)
	deco.path.unshift("");

    return deco;
}

/*
 * Tell whether the domain of a URL matches the domain of a pattern. Both
 * arguments are arrays of domain segments and are compared right to left.
 * Only the leftmost pattern segment is treated as a wildcard:
 *   "*"   - the URL must have exactly as many segments as the pattern,
 *   "**"  - the URL must have more segments than the pattern,
 *   "***" - the URL may have any number of segments, including one fewer
 *           than the pattern.
 *
 * Be sane: both arguments should be arrays of length >= 2
 */
function domain_matches(url_domain, pattern_domain)
{
    const url_len = url_domain.length;
    const pattern_len = pattern_domain.length;
    const extra_segments = url_len - pattern_len;

    /* Walk from the right until the leftmost pattern segment is reached. */
    let offset = 1;
    while (offset <= url_len && offset !== pattern_len) {
	const pattern_part = pattern_domain[pattern_len - offset];
	if (pattern_part !== url_domain[url_len - offset])
	    return false;
	offset++;
    }

    /* URL exhausted first; only a "***" standing for nothing can match. */
    if (offset > url_len)
	return extra_segments === -1 && pattern_domain[0] === "***";

    const head = pattern_domain[0];
    switch (head) {
    case "*":
	return extra_segments === 0;
    case "**":
	return extra_segments > 0;
    case "***":
	return true;
    default:
	return extra_segments === 0 && head === url_domain[url_len - offset];
    }
}

/*
 * Tell whether the path of a URL matches the path of a pattern. Paths are
 * arrays of segments and are compared left to right. Only the last pattern
 * segment is treated as a wildcard:
 *   "*"   - the URL must have exactly as many segments as the pattern,
 *   "**"  - the URL must have more segments than the pattern (a URL segment
 *           that literally equals "**" is also accepted),
 *   "***" - the URL may have any number of segments in place of the wildcard,
 *           including none at all.
 *
 * A pattern with a trailing slash ("dash" in this file's vocabulary) only
 * matches URLs that also have one; this is checked for non-wildcard matches
 * and for the literal "**" case.
 */
function path_matches(url_path, url_trailing_dash,
		      pattern_path, pattern_trailing_dash)
{
    const dashes_ok = !(pattern_trailing_dash && !url_trailing_dash);

    if (pattern_path.length === 0)
	return url_path.length === 0 && dashes_ok;

    const length_difference = url_path.length - pattern_path.length;

    for (let i = 0; i < url_path.length; i++) {
	/* Reached the last pattern segment - the only possible wildcard. */
	if (pattern_path.length === i + 1) {
	    if (pattern_path[i] === "*")
		return length_difference === 0;
	    if (pattern_path[i] === "**") {
		return length_difference > 0 ||
		    (url_path[i] === "**" && dashes_ok);
	    }
	    if (pattern_path[i] === "***")
		return length_difference >= 0;
	    return length_difference === 0 &&
		pattern_path[i] === url_path[i] && dashes_ok;
	}

	if (pattern_path[i] !== url_path[i])
	    return false;
    }

    /*
     * The whole URL path got consumed before the last pattern segment was
     * reached. The only remaining way to match is a final "***" that stands
     * for zero segments, e.g. pattern "/a/***" against URL path "/a".
     */
    return length_difference === -1 &&
	pattern_path[pattern_path.length - 1] === "***";
}

/*
 * Tell whether `url` is matched by `pattern`. Both get deconstructed first; if
 * either cannot be deconstructed, this is logged and false is returned.
 * Protocols must be equal, both or neither must have a domain, and the domain
 * (when present) and the path must match. A "file" pattern with a trailing
 * slash never matches.
 */
function url_matches(url, pattern)
{
    const url_deco = deconstruct_url(url);
    const pattern_deco = deconstruct_url(pattern);

    if (url_deco === undefined || pattern_deco === undefined) {
	console.log(`bad comparison: ${url} and ${pattern}`);
	return false;
    }

    if (pattern_deco.proto !== url_deco.proto)
	return false;

    if (pattern_deco.proto === "file" && pattern_deco.trailing_dash)
	return false;

    const url_has_domain = !!url_deco.domain;
    const pattern_has_domain = !!pattern_deco.domain;
    if (url_has_domain !== pattern_has_domain)
	return false;

    if (url_has_domain &&
	!domain_matches(url_deco.domain, pattern_deco.domain))
	return false;

    return path_matches(url_deco.path, url_deco.trailing_dash,
			pattern_deco.path, pattern_deco.trailing_dash);
}

/*
 * Yield domain patterns built from `domain_segments` (array of domain
 * segments), starting with the full domain and then dropping more and more
 * leftmost segments. Dropping exactly one segment yields a "*." pattern,
 * dropping two or more yields a "**." pattern, and every step additionally
 * yields a "***." pattern.
 */
function* each_domain_pattern(domain_segments)
{
    let dropped = 0;

    while (dropped < domain_segments.length) {
	const suffix = domain_segments.slice(dropped).join(".");

	switch (dropped) {
	case 0:
	    yield suffix;
	    break;
	case 1:
	    yield `*.${suffix}`;
	    break;
	default:
	    yield `**.${suffix}`;
	}

	yield `***.${suffix}`;
	dropped++;
    }
}

/*
 * Yield path patterns built from `path_segments` (array of path segments),
 * starting with the full path and then keeping fewer and fewer leading
 * segments. The full path is yielded literally (preceded by a variant with
 * "/" appended when `trailing_dash` is set). Omitting one segment yields a
 * "/*" pattern, omitting two or more yields a "/**" pattern. A "/***" pattern
 * is yielded at every step, except where the path itself already ends with a
 * "***" segment and fewer than two segments are omitted. A "/*" pattern is
 * likewise skipped when the omitted segment is itself "*".
 */
function* each_path_pattern(path_segments, trailing_dash)
{
    for (let kept = path_segments.length; kept >= 1; kept--) {
	const prefix = path_segments.slice(0, kept).join("/");
	const omitted = path_segments.length - kept;

	if (omitted === 0) {
	    if (trailing_dash)
		yield `${prefix}/`;
	    yield prefix;
	} else if (omitted === 1) {
	    if (path_segments[kept] !== "*")
		yield `${prefix}/*`;
	} else {
	    yield `${prefix}/**`;
	}

	const ends_with_triple =
	      path_segments[path_segments.length - 1] === "***";
	if (omitted > 1 || !ends_with_triple)
	    yield `${prefix}/***`;
    }
}

/*
 * Generate every possible pattern that matches url: each domain pattern
 * combined with each path pattern, prefixed with the URL's protocol. URLs
 * without a domain get an empty domain part. If url cannot be deconstructed,
 * this is logged and nothing is yielded.
 */
function* each_url_pattern(url)
{
    const parsed = deconstruct_url(url);

    if (parsed === undefined) {
	console.log("bad url format", url);
	return false;
    }

    const {proto, domain, path, trailing_dash} = parsed;
    const domain_patterns = domain ? each_domain_pattern(domain) : [""];

    for (const domain_pattern of domain_patterns) {
	const path_patterns = each_path_pattern(path, trailing_dash);
	for (const path_pattern of path_patterns)
	    yield `${proto}://${domain_pattern}${path_pattern}`;
    }
}

/*
 * EXPORTS_START
 * EXPORT url_matches
 * EXPORT each_url_pattern
 * EXPORTS_END
 */