aboutsummaryrefslogtreecommitdiff
path: root/common/patterns.js
diff options
context:
space:
mode:
Diffstat (limited to 'common/patterns.js')
-rw-r--r-- common/patterns.js 260
1 files changed, 112 insertions, 148 deletions
diff --git a/common/patterns.js b/common/patterns.js
index be7c650..635b128 100644
--- a/common/patterns.js
+++ b/common/patterns.js
@@ -1,187 +1,151 @@
/**
- * Hachette operations on page url patterns
+ * This file is part of Haketilo.
+ *
+ * Function: Operations on page URL patterns.
*
* Copyright (C) 2021 Wojtek Kosior
* Redistribution terms are gathered in the `copyright' file.
*/
-const proto_re = "[a-zA-Z]*:\/\/";
-const domain_re = "[^/?#]+";
-const segments_re = "/[^?#]*";
-const query_re = "\\?[^#]*";
-
-const url_regex = new RegExp(`\
-^\
-(${proto_re})\
-(${domain_re})\
-(${segments_re})?\
-(${query_re})?\
-#?.*\$\
-`);
-
-function deconstruct_url(url)
-{
- const regex_match = url_regex.exec(url);
- if (regex_match === null)
- return undefined;
+/*
+ * Limits deconstruct_url() applies to a URL unless it is called with
+ * use_limits=false. The path limits are only enforced for URLs that have a
+ * domain.
+ */
+const MAX = {
+ /* Maximum number of path segments kept; further segments are dropped. */
+ URL_PATH_LEN: 12,
+ /* Path strings longer than this many characters get truncated. */
+ URL_PATH_CHARS: 255,
+ /* Maximum number of domain labels kept, counted from the right. */
+ DOMAIN_LEN: 7,
+ /* Domain strings longer than this many characters get truncated. */
+ DOMAIN_CHARS: 100
+};
- let [_, proto, domain, path, query] = regex_match;
+/* Captures: 1 - protocol name, 2 - everything that follows "://". */
+const proto_regex = /^(\w+):\/\/(.*)$/;
- domain = domain.split(".");
- let path_trailing_dash =
- path && path[path.length - 1] === "/";
- path = (path || "").split("/").filter(s => s !== "");
- path.unshift("");
+/*
+ * Regex source fragments. They get applied to the part of the URL that
+ * follows "://".
+ */
+/* Optional "user@" prefix (used for ftp only). */
+const user_re = "[^/?#@]+@";
+/* Dot-separated labels made of ASCII letters, digits and hyphens. */
+const domain_re = "[.a-zA-Z0-9-]+";
+/* Everything up to the first "?" or "#". */
+const path_re = "[^?#]*";
+/* Optional leading "?" and everything up to the first "#". */
+const query_re = "\\??[^#]*";
- return {proto, domain, path, query, path_trailing_dash};
-}
+/* Captures: 1 - domain, 2 - path, 3 - query. */
+const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);
+
+/* Captures: 1 - path. */
+const file_regex = new RegExp(`^(${path_re}).*`);
-/* Be sane: both arguments should be arrays of length >= 2 */
-function domain_matches(url_domain, pattern_domain)
+/* Captures: 1 - "user@" (may be absent), 2 - domain, 3 - path. */
+const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);
+
+/*
+ * Split `url' into its components.
+ *
+ * Arguments:
+ *   url        - string of the form "<protocol>://<rest>"; supported protocols
+ *                are file, ftp, http and https.
+ *   use_limits - when true (the default) the limits from MAX are enforced;
+ *                when false nothing gets truncated.
+ *
+ * Returns an object with the following properties:
+ *   proto            - protocol name (without "://"),
+ *   domain           - array of domain labels (ftp, http and https only;
+ *                      lower-cased for http and https),
+ *   path             - array of non-empty path segments,
+ *   query            - query string (http and https only),
+ *   trailing_dash    - whether the path string ended with "/",
+ *   domain_truncated - set to true if domain labels were dropped,
+ *   path_truncated   - set to true if path segments were dropped.
+ * Returns undefined if the part after "://" of an ftp, http or https URL
+ * cannot be parsed.
+ *
+ * Throws a string if `url' has no "<protocol>://" prefix or if the protocol is
+ * not supported.
+ */
+function deconstruct_url(url, use_limits=true)
{
- const length_difference = url_domain.length - pattern_domain.length;
-
- for (let i = 1; i <= url_domain.length; i++) {
- const url_part = url_domain[url_domain.length - i];
- const pattern_part = pattern_domain[pattern_domain.length - i];
-
- if (pattern_domain.length === i) {
- if (pattern_part === "*")
- return length_difference === 0;
- if (pattern_part === "**")
- return length_difference > 0;
- if (pattern_part === "***")
- return true;
- return length_difference === 0 && pattern_part === url_part;
- }
+    /*
+     * Use a private copy of the limits. Writing Infinity into MAX itself
+     * would silently disable the limits for every later call as well.
+     */
+    const max = {};
+    for (const key in MAX)
+        max[key] = use_limits ? MAX[key] : Infinity;
- if (pattern_part !== url_part)
- return false;
+    const proto_match = proto_regex.exec(url);
+    if (proto_match === null)
+        throw `bad url '${url}'`;
+
+    const deco = {proto: proto_match[1]};
+
+    if (deco.proto === "file") {
+        /* file_regex accepts any input, so this match is never null. */
+        deco.path = file_regex.exec(proto_match[2])[1];
+    } else if (deco.proto === "ftp") {
+        const ftp_match = ftp_regex.exec(proto_match[2]);
+        /* Report an unparsable URL the same way the http(s) branch does. */
+        if (!ftp_match)
+            return undefined;
+        /* Capture 1 is the optional "user@" part; it is not kept. */
+        [deco.domain, deco.path] = ftp_match.slice(2, 4);
+    } else if (deco.proto === "http" || deco.proto === "https") {
+        const http_match = http_regex.exec(proto_match[2]);
+        if (!http_match)
+            return undefined;
+        [deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
+        deco.domain = deco.domain.toLowerCase();
+    } else {
+        throw `unsupported protocol in url '${url}'`;
}
- return pattern_domain.length === url_domain.length + 1 &&
- pattern_domain[0] === "***";
-}
+    /* Must be computed before the path string gets truncated or split. */
+    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";
-function path_matches(url_path, url_trailing_dash,
- pattern_path, pattern_trailing_dash)
-{
- const dashes_ok = !(pattern_trailing_dash && !url_trailing_dash);
-
- if (pattern_path.length === 0)
- return url_path.length === 0 && dashes_ok;
-
- const length_difference = url_path.length - pattern_path.length;
-
- for (let i = 0; i < url_path.length; i++) {
- if (pattern_path.length === i + 1) {
- if (pattern_path[i] === "*")
- return length_difference === 0;
- if (pattern_path[i] === "**") {
- return length_difference > 0 ||
- (url_path[i] === "**" && dashes_ok);
- }
- if (pattern_path[i] === "***")
- return length_difference >= 0;
- return length_difference === 0 &&
- pattern_path[i] === url_path[i] && dashes_ok;
+    if (deco.domain) {
+        if (deco.domain.length > max.DOMAIN_CHARS) {
+            /* Keep the longest suffix of whole labels that fits. */
+            const idx = deco.domain.indexOf(".", deco.domain.length -
+                                            max.DOMAIN_CHARS);
+            if (idx === -1)
+                deco.domain = [];
+            else
+                deco.domain = deco.domain.substring(idx + 1);
+
+            deco.domain_truncated = true;
}
- if (pattern_path[i] !== url_path[i])
- return false;
+        if (deco.path.length > max.URL_PATH_CHARS) {
+            /*
+             * Keep the longest prefix of whole segments that fits. Searching
+             * backwards from the limit guarantees the result is not longer
+             * than URL_PATH_CHARS; cutting at the very last "/" would not.
+             */
+            const idx = deco.path.lastIndexOf("/", max.URL_PATH_CHARS);
+            deco.path = deco.path.substring(0, idx);
+            deco.path_truncated = true;
+        }
}
- return false;
-}
-
-function url_matches(url, pattern)
-{
- const url_deco = deconstruct_url(url);
- const pattern_deco = deconstruct_url(pattern);
-
- if (url_deco === undefined || pattern_deco === undefined) {
- console.log(`bad comparison: ${url} and ${pattern}`);
- return false
+    /* The domain is already an (empty) array if nothing fit above. */
+    if (typeof deco.domain === "string") {
+        deco.domain = deco.domain.split(".");
+        /* Drop leading labels so that at most DOMAIN_LEN remain. */
+        if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
+            > 0)
+            deco.domain_truncated = true;
}
- if (pattern_deco.proto !== url_deco.proto)
- return false;
+    deco.path = deco.path.split("/").filter(s => s !== "");
+    /* Segment count is only limited for URLs that have a domain. */
+    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
+        deco.path_truncated = true;
- return domain_matches(url_deco.domain, pattern_deco.domain) &&
- path_matches(url_deco.path, url_deco.path_trailing_dash,
- pattern_deco.path, pattern_deco.path_trailing_dash);
+    return deco;
}
-/*
- * Call callback for every possible pattern that matches url. Return when there
- * are no more patterns or callback returns false.
- */
-function for_each_possible_pattern(url, callback)
+/*
+ * Generate domain patterns for a deconstructed URL.
+ *
+ * `deco' is an object as returned by deconstruct_url(); only its `domain'
+ * (array of labels) and `domain_truncated' properties are read.
+ *
+ * Every suffix of the domain that has at least 2 labels is visited, longest
+ * first. For each of them the following are yielded, in this order:
+ *   - the suffix itself (only for the full, non-truncated domain),
+ *   - "*." + suffix (only when exactly 1 label was skipped and the domain is
+ *     not truncated),
+ *   - "**." + suffix (only when 2 or more labels were skipped),
+ *   - "***." + suffix (always).
+ * A domain with fewer than 2 labels yields nothing.
+ */
+function* each_domain_pattern(deco)
{
- const deco = deconstruct_url(url);
-
- if (deco === undefined) {
- console.log("bad url format", url);
- return;
+    for (let slice = 0; slice < deco.domain.length - 1; slice++) {
+        const domain_part = deco.domain.slice(slice).join(".");
+        if (slice === 0 && !deco.domain_truncated)
+            yield domain_part;
+        if (slice === 1 && !deco.domain_truncated)
+            yield "*." + domain_part;
+        if (slice > 1)
+            yield "**." + domain_part;
+        yield "***." + domain_part;
}
+}
- for (let d_slice = 0; d_slice < deco.domain.length; d_slice++) {
- const domain_part = deco.domain.slice(d_slice).join(".");
- const domain_wildcards = [];
- if (d_slice === 0)
- domain_wildcards.push("");
- if (d_slice === 1)
- domain_wildcards.push("*.");
- if (d_slice > 0)
- domain_wildcards.push("**.");
- domain_wildcards.push("***.");
-
- for (const domain_wildcard of domain_wildcards) {
- const domain_pattern = domain_wildcard + domain_part;
-
- for (let s_slice = deco.path.length; s_slice > 0; s_slice--) {
- const path_part = deco.path.slice(0, s_slice).join("/");
- const path_wildcards = [];
- if (s_slice === deco.path.length) {
- if (deco.path_trailing_dash)
- path_wildcards.push("/");
- path_wildcards.push("");
- }
- if (s_slice === deco.path.length - 1 &&
- deco.path[s_slice] !== "*")
- path_wildcards.push("/*");
- if (s_slice < deco.path.length &&
- (deco.path[s_slice] !== "**" ||
- s_slice < deco.path.length - 1))
- path_wildcards.push("/**");
- if (deco.path[s_slice] !== "***" || s_slice < deco.path.length)
- path_wildcards.push("/***");
-
- for (const path_wildcard of path_wildcards) {
- const path_pattern = path_part + path_wildcard;
-
- const pattern = deco.proto + domain_pattern + path_pattern;
-
- if (callback(pattern) === false)
- return;
- }
- }
+/*
+ * Generate path patterns for a deconstructed URL.
+ *
+ * `deco' is an object as returned by deconstruct_url(); its `path' (array of
+ * segments), `path_truncated', `trailing_dash' and `proto' properties are
+ * read.
+ *
+ * Every prefix of the path is visited, from the full path down to the empty
+ * one. For each prefix the following are yielded, in this order:
+ *   - prefix + "/" and the bare prefix (only for the full, non-truncated
+ *     path; the former requires a trailing dash in the URL, the latter is
+ *     skipped for a "file" URL with empty path),
+ *   - prefix + "/*" (only when exactly 1 segment was skipped, the path is not
+ *     truncated and the skipped segment is not literally "*"),
+ *   - prefix + "/**" (only when 2 or more segments were skipped),
+ *   - prefix + "/***" (always, unless exactly 1 segment of a non-truncated
+ *     path was skipped and that segment is literally "***").
+ */
+function* each_path_pattern(deco)
+{
+    for (let slice = deco.path.length; slice >= 0; slice--) {
+        const path_part = ["", ...deco.path.slice(0, slice)].join("/");
+        if (slice === deco.path.length && !deco.path_truncated) {
+            if (deco.trailing_dash)
+                yield path_part + "/";
+            if (slice > 0 || deco.proto !== "file")
+                yield path_part;
}
+        if (slice === deco.path.length - 1 && !deco.path_truncated &&
+            deco.path[slice] !== "*")
+            yield path_part + "/*";
+        if (slice < deco.path.length - 1)
+            yield path_part + "/**";
+        if (slice !== deco.path.length - 1 || deco.path_truncated ||
+            deco.path[slice] !== "***")
+            yield path_part + "/***";
}
}
-function possible_patterns(url)
+/*
+ * Generate every possible pattern that matches url.
+ *
+ * The URL is deconstructed with deconstruct_url(); each domain pattern (or a
+ * single empty domain part when the URL has no domain) is combined with each
+ * path pattern into "<proto>://<domain pattern><path pattern>". If the URL
+ * could not be deconstructed, an error is logged and nothing is yielded.
+ */
+function* each_url_pattern(url)
{
- const patterns = [];
- for_each_possible_pattern(url, patterns.push);
+    const deco = deconstruct_url(url);
- return patterns;
+    if (deco === undefined) {
+        console.error("bad url format", url);
+        return false;
+    }
+
+    const domain_patterns = deco.domain ? each_domain_pattern(deco) : [""];
+    for (const domain_pattern of domain_patterns) {
+        const prefix = `${deco.proto}://${domain_pattern}`;
+        for (const path_pattern of each_path_pattern(deco))
+            yield prefix + path_pattern;
+    }
}
/*
* EXPORTS_START
- * EXPORT url_matches
- * EXPORT for_each_possible_pattern
- * EXPORT possible_patterns
+ * EXPORT each_url_pattern
+ * EXPORT deconstruct_url
* EXPORTS_END
*/