diff options
Diffstat (limited to 'common/patterns.js')
-rw-r--r-- | common/patterns.js | 260 |
1 files changed, 112 insertions, 148 deletions
/**
 * This file is part of Haketilo.
 *
 * Function: Operations on page URL patterns.
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * Limits applied by deconstruct_url() when `use_limits' is true. *_LEN values
 * count path segments/domain labels, *_CHARS values count characters. Frozen
 * so that no call can alter the limits seen by later calls.
 */
const MAX = Object.freeze({
    URL_PATH_LEN:   12,
    URL_PATH_CHARS: 255,
    DOMAIN_LEN:     7,
    DOMAIN_CHARS:   100
});

/* Splits "<proto>://<rest>" into protocol name and everything after "://". */
const proto_regex = /^(\w+):\/\/(.*)$/;

const user_re = "[^/?#@]+@";
const domain_re = "[.a-zA-Z0-9-]+";
const path_re = "[^?#]*";
const query_re = "\\??[^#]*";

/* All three regexes below are matched against the part after "://". */
const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);

const file_regex = new RegExp(`^(${path_re}).*`);

const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);

/*
 * Split url into its components.
 *
 * Arguments:
 *   url         string of the form "<proto>://<rest>"; supported protocols
 *               are "file", "ftp", "http" and "https".
 *   use_limits  when true (default) the MAX limits are enforced; when false
 *               nothing gets truncated.
 *
 * Returns an object with:
 *   proto             protocol name without "://"
 *   domain            array of domain labels (absent for "file"); lowercased
 *                     for http(s)
 *   path              array of non-empty path segments
 *   query             query string including leading "?" (http(s) only)
 *   trailing_dash     true if the path (before truncation) ended with "/"
 *   domain_truncated  set to true if domain labels were dropped
 *   path_truncated    set to true if path segments were dropped
 * or undefined if the part after "://" cannot be parsed for an ftp or http(s)
 * url.
 *
 * Throws a string if url has no "<proto>://" prefix or the protocol is not
 * supported.
 */
function deconstruct_url(url, use_limits=true)
{
    /*
     * Build a private copy of the limits. Writing Infinity into MAX itself
     * would disable the limits for every subsequent call.
     */
    const max = {};
    for (const key of Object.keys(MAX))
        max[key] = use_limits ? MAX[key] : Infinity;

    const proto_match = proto_regex.exec(url);
    if (proto_match === null)
        throw `bad url '${url}'`;

    const deco = {proto: proto_match[1]};

    if (deco.proto === "file") {
        /* file_regex matches any string, so exec() never returns null. */
        deco.path = file_regex.exec(proto_match[2])[1];
    } else if (deco.proto === "ftp") {
        const ftp_match = ftp_regex.exec(proto_match[2]);
        if (!ftp_match)
            return undefined;
        /* Group 1 is the optional "user@" part, which is discarded. */
        [deco.domain, deco.path] = ftp_match.slice(2, 4);
    } else if (deco.proto === "http" || deco.proto === "https") {
        const http_match = http_regex.exec(proto_match[2]);
        if (!http_match)
            return undefined;
        [deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
        deco.domain = deco.domain.toLowerCase();
    } else {
        throw `unsupported protocol in url '${url}'`;
    }

    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";

    /* Character limits are only applied to urls that have a domain. */
    if (deco.domain) {
        if (deco.domain.length > max.DOMAIN_CHARS) {
            /* Keep only whole labels that fit in the last DOMAIN_CHARS. */
            const idx = deco.domain.indexOf(".", deco.domain.length -
                                            max.DOMAIN_CHARS);
            if (idx === -1)
                deco.domain = [];
            else
                deco.domain = deco.domain.substring(idx + 1);

            deco.domain_truncated = true;
        }

        if (deco.path.length > max.URL_PATH_CHARS) {
            /*
             * Cut at the last "/" located within the limit, so that only
             * whole segments remain and the result respects URL_PATH_CHARS.
             */
            const cut = deco.path.lastIndexOf("/", max.URL_PATH_CHARS);
            deco.path = deco.path.substring(0, cut);
            deco.path_truncated = true;
        }
    }

    /* domain is already an (empty) array if it got truncated to nothing. */
    if (typeof deco.domain === "string") {
        deco.domain = deco.domain.split(".");
        /* Drop leading labels so that at most DOMAIN_LEN remain. */
        if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
            > 0)
            deco.domain_truncated = true;
    }

    deco.path = deco.path.split("/").filter(s => s !== "");
    /* Drop trailing segments so that at most URL_PATH_LEN remain. */
    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
        deco.path_truncated = true;

    return deco;
}

/*
 * Generate domain parts of patterns for a deconstructed url. For each suffix
 * of deco.domain that has at least 2 labels this yields the applicable
 * variants among: the exact domain, "*.", "**." and "***." prefixed ones.
 * Exact and "*." variants are skipped when the domain has been truncated.
 */
function* each_domain_pattern(deco)
{
    for (let slice = 0; slice < deco.domain.length - 1; slice++) {
        const domain_part = deco.domain.slice(slice).join(".");
        if (slice === 0 && !deco.domain_truncated)
            yield domain_part;
        if (slice === 1 && !deco.domain_truncated)
            yield "*." + domain_part;
        if (slice > 1)
            yield "**." + domain_part;
        yield "***." + domain_part;
    }
}

/*
 * Generate path parts of patterns for a deconstructed url, going from the
 * full path down to the empty prefix. Yields the applicable variants among:
 * the exact path (with and without trailing "/"), "/*", "/**" and "/***"
 * suffixed ones. Exact and "/*" variants are skipped when the path has been
 * truncated.
 */
function* each_path_pattern(deco)
{
    for (let slice = deco.path.length; slice >= 0; slice--) {
        const path_part = ["", ...deco.path.slice(0, slice)].join("/");
        if (slice === deco.path.length && !deco.path_truncated) {
            if (deco.trailing_dash)
                yield path_part + "/";
            /* An empty path is not yielded for "file" urls. */
            if (slice > 0 || deco.proto !== "file")
                yield path_part;
        }
        if (slice === deco.path.length - 1 && !deco.path_truncated &&
            deco.path[slice] !== "*")
            yield path_part + "/*";
        if (slice < deco.path.length - 1)
            yield path_part + "/**";
        if (slice !== deco.path.length - 1 || deco.path_truncated ||
            deco.path[slice] !== "***")
            yield path_part + "/***";
    }
}

/*
 * Generate every possible pattern that matches url. Yields nothing (after
 * logging an error) if deconstruct_url() could not parse the url; exceptions
 * thrown by deconstruct_url() propagate to the caller.
 */
function* each_url_pattern(url)
{
    const deco = deconstruct_url(url);

    if (deco === undefined) {
        console.error("bad url format", url);
        return;
    }

    /* Urls without a domain ("file") get a single empty domain part. */
    const all_domains = deco.domain ? each_domain_pattern(deco) : [""];
    for (const domain of all_domains) {
        for (const path of each_path_pattern(deco))
            yield `${deco.proto}://${domain}${path}`;
    }
}

/*
 * EXPORTS_START
 * EXPORT each_url_pattern
 * EXPORT deconstruct_url
 * EXPORTS_END
 */