/**
 * Hachette operations on page url patterns
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/* Splits "<protocol>://<rest>" into the protocol name and everything else. */
const proto_regex = /^(\w+):\/\/(.*)$/;

/* Regex source fragments shared by the per-protocol regexes below. */
const user_re = "[^/?#@]+@";
const domain_re = "[^/?#]+";
const path_re = "[^?#]*";
const query_re = "\\??[^#]*";

/* Captures: 1 - domain, 2 - path, 3 - query. */
const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);

/* Captures: 1 - path. */
const file_regex = new RegExp(`^(${path_re}).*`);

/* Captures: 1 - optional "user@", 2 - domain, 3 - path. */
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);

/*
 * Split a url (or a url pattern) into its components.
 *
 * Returns undefined when `url' cannot be parsed. Otherwise returns an object
 * with the following properties:
 *   proto         - protocol name, without "://"
 *   domain        - array of dot-separated domain labels (absent for "file")
 *   path          - array of path segments; the first element is "" when the
 *                   path started with "/" or when there are no segments at all
 *   trailing_dash - true when the path ended with "/"
 *   query         - raw query part (only set in the generic, http-like branch)
 *
 * Every protocol other than "file" and "ftp" is parsed with the http rules.
 */
function deconstruct_url(url)
{
    const proto_match = proto_regex.exec(url);
    if (proto_match === null)
        return undefined;

    const deco = {proto: proto_match[1]};
    const rest = proto_match[2];

    if (deco.proto === "file") {
        /* file_regex accepts an empty path, so it matches any input. */
        deco.path = file_regex.exec(rest)[1];
    } else if (deco.proto === "ftp") {
        /*
         * ftp_regex needs at least one domain character; inputs such as
         * "ftp://" or "ftp:///pub" don't match and must be reported as
         * unparsable instead of crashing on a null match.
         */
        const ftp_match = ftp_regex.exec(rest);
        if (ftp_match === null)
            return undefined;

        /* Group 1 is the optional "user@" part, which is ignored. */
        [deco.domain, deco.path] = ftp_match.slice(2, 4);
    } else {
        const http_match = http_regex.exec(rest);
        if (http_match === null)
            return undefined;

        [deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
    }

    if (deco.domain)
        deco.domain = deco.domain.split(".");

    /* Note: "dash" in the names below refers to the "/" character. */
    const leading_dash = deco.path[0] === "/";
    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";
    deco.path = deco.path.split("/").filter(s => s !== "");
    if (leading_dash || deco.path.length === 0)
        deco.path.unshift("");

    return deco;
}

/*
 * Check whether the domain of a url is matched by the domain of a pattern.
 * Both arguments are arrays of domain labels and get compared starting from
 * the rightmost label. The leftmost label of the pattern may be a wildcard:
 *   "*"   - matches when both domains have the same number of labels
 *   "**"  - matches when the url domain has more labels than the pattern
 *   "***" - matches any number of labels in its place, including none
 *
 * Be sane: both arguments should be arrays of length >= 2
 */
function domain_matches(url_domain, pattern_domain)
{
    const length_difference = url_domain.length - pattern_domain.length;

    for (let i = 1; i <= url_domain.length; i++) {
        const url_part = url_domain[url_domain.length - i];
        const pattern_part = pattern_domain[pattern_domain.length - i];

        /* Leftmost label of the pattern reached - the only wildcard spot. */
        if (pattern_domain.length === i) {
            if (pattern_part === "*")
                return length_difference === 0;
            if (pattern_part === "**")
                return length_difference > 0;
            if (pattern_part === "***")
                return true;
            return length_difference === 0 && pattern_part === url_part;
        }

        if (pattern_part !== url_part)
            return false;
    }

    /*
     * All url labels were consumed and the pattern has labels left. This is
     * only a match when the single remaining label is "***".
     */
    return pattern_domain.length === url_domain.length + 1 &&
        pattern_domain[0] === "***";
}

/*
 * Check whether the path of a url is matched by the path of a pattern.
 * Paths are arrays of segments, compared from the left. The last segment
 * of the pattern may be a wildcard:
 *   "*"   - matches when both paths have the same number of segments
 *   "**"  - matches when the url path has more segments than the pattern
 *           (or when the url literally has "**" at that position)
 *   "***" - matches when the url path has at least as many segments
 *
 * In the literal comparisons a pattern with a trailing "/" only matches
 * a url that also has a trailing "/".
 */
function path_matches(url_path, url_trailing_dash,
                      pattern_path, pattern_trailing_dash)
{
    const dashes_ok = !(pattern_trailing_dash && !url_trailing_dash);

    if (pattern_path.length === 0)
        return url_path.length === 0 && dashes_ok;

    const length_difference = url_path.length - pattern_path.length;

    for (let i = 0; i < url_path.length; i++) {
        /* Last segment of the pattern reached - the only wildcard spot. */
        if (pattern_path.length === i + 1) {
            if (pattern_path[i] === "*")
                return length_difference === 0;
            if (pattern_path[i] === "**") {
                return length_difference > 0 ||
                    (url_path[i] === "**" && dashes_ok);
            }
            if (pattern_path[i] === "***")
                return length_difference >= 0;
            return length_difference === 0 &&
                pattern_path[i] === url_path[i] && dashes_ok;
        }

        if (pattern_path[i] !== url_path[i])
            return false;
    }

    /* The url path ran out before the end of the pattern was reached. */
    return false;
}

/*
 * Tell whether `url' is matched by `pattern'. When either of them cannot be
 * parsed, a message is logged and false is returned. A "file" pattern with
 * a trailing "/" never matches.
 */
function url_matches(url, pattern)
{
    const url_deco = deconstruct_url(url);
    const pattern_deco = deconstruct_url(pattern);

    if (url_deco === undefined || pattern_deco === undefined) {
        console.log(`bad comparison: ${url} and ${pattern}`);
        return false;
    }

    return pattern_deco.proto === url_deco.proto &&
        !(pattern_deco.proto === "file" && pattern_deco.trailing_dash) &&
        !!url_deco.domain === !!pattern_deco.domain &&
        (!url_deco.domain ||
         domain_matches(url_deco.domain, pattern_deco.domain)) &&
        path_matches(url_deco.path, url_deco.trailing_dash,
                     pattern_deco.path, pattern_deco.trailing_dash);
}

/*
 * Yield domain patterns built from the given array of domain labels:
 * first the full domain, then variants in which more and more leading
 * labels are replaced by a wildcard.
 */
function* each_domain_pattern(domain_segments)
{
    for (let slice = 0; slice < domain_segments.length; slice++) {
        const domain_part = domain_segments.slice(slice).join(".");

        if (slice === 0)
            yield domain_part;
        if (slice === 1)
            yield "*." + domain_part;
        if (slice > 1)
            yield "**." + domain_part;
        yield "***." + domain_part;
    }
}

/*
 * Yield path patterns built from the given array of path segments: first
 * the full path (with a trailing "/" first, if the url had one), then
 * variants in which more and more final segments are replaced by a wildcard.
 */
function* each_path_pattern(path_segments, trailing_dash)
{
    for (let slice = path_segments.length; slice > 0; slice--) {
        const path_part = path_segments.slice(0, slice).join("/");

        if (slice === path_segments.length) {
            if (trailing_dash)
                yield path_part + "/";
            yield path_part;
        }
        if (slice === path_segments.length - 1 && path_segments[slice] !== "*")
            yield path_part + "/*";
        if (slice < path_segments.length - 1)
            yield path_part + "/**";
        if (slice < path_segments.length - 1 ||
            path_segments[path_segments.length - 1] !== "***")
            yield path_part + "/***";
    }
}

/*
 * Generate every possible pattern that matches url. When url cannot be
 * parsed, a message is logged and nothing is yielded.
 */
function* each_url_pattern(url)
{
    const deco = deconstruct_url(url);

    if (deco === undefined) {
        console.log("bad url format", url);
        return false;
    }

    /* Urls without a domain ("file") get an empty domain component. */
    const all_domains = deco.domain ? each_domain_pattern(deco.domain) : [""];
    for (const domain of all_domains) {
        for (const path of each_path_pattern(deco.path, deco.trailing_dash))
            yield `${deco.proto}://${domain}${path}`;
    }
}

/*
 * EXPORTS_START
 * EXPORT url_matches
 * EXPORT each_url_pattern
 * EXPORTS_END
 */