/**
 * This file is part of Haketilo.
 *
 * Function: Operations on page URL patterns.
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * Limits applied while deconstructing URLs that carry a domain:
 * maximum number of path segments, maximum path length in characters,
 * maximum number of domain labels and maximum domain length in characters.
 */
const MAX_URL_PATH_LEN = 12;
const MAX_URL_PATH_CHARS = 255;
const MAX_DOMAIN_LEN = 7;
const MAX_DOMAIN_CHARS = 100;

/* Splits "<proto>://<rest>" into protocol (group 1) and the rest (group 2). */
const proto_regex = /^(\w+):\/\/(.*)$/;

/* Regex source fragments used to assemble the per-protocol regexes below. */
const user_re = "[^/?#@]+@";
const domain_re = "[^/?#]+";
const path_re = "[^?#]*";
const query_re = "\\??[^#]*";

/* Groups: 1 - domain, 2 - path, 3 - query. */
const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);
/* Groups: 1 - path. */
const file_regex = new RegExp(`^(${path_re}).*`);
/* Groups: 1 - optional "user@", 2 - domain, 3 - path. */
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);

/*
 * Split url into the pieces needed for pattern generation.
 *
 * Returns undefined when url does not have the "<proto>://..." form or when
 * a non-"file" URL has no domain part. Otherwise returns an object with:
 *   proto            - protocol name, without "://"
 *   domain           - array of domain labels (absent for "file" URLs)
 *   path             - array of path segments; starts with "" when the path
 *                      began with "/" or had no segments at all
 *   query            - query string (only set in the generic/http branch)
 *   trailing_dash    - whether the path ended with "/"
 *   domain_truncated - set to true when the domain was cut to fit the limits
 *   path_truncated   - set to true when the path was cut to fit the limits
 */
function deconstruct_url(url)
{
    const proto_match = proto_regex.exec(url);
    if (proto_match === null)
        return undefined;

    const deco = {proto: proto_match[1]};

    if (deco.proto === "file") {
        /* path_re can match the empty string, so this exec() never fails. */
        deco.path = file_regex.exec(proto_match[2])[1];
    } else if (deco.proto === "ftp") {
        /*
         * domain_re needs at least one character, so a host-less URL such as
         * "ftp://" or "ftp:///dir" does not match; report it as a bad URL
         * rather than throwing on a null match.
         */
        const ftp_match = ftp_regex.exec(proto_match[2]);
        if (ftp_match === null)
            return undefined;

        /* Group 1 (the optional "user@") is deliberately dropped. */
        [deco.domain, deco.path] = ftp_match.slice(2, 4);
    } else {
        const http_match = http_regex.exec(proto_match[2]);
        if (!http_match)
            return undefined;

        [deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
    }

    const leading_dash = deco.path[0] === "/";
    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";

    /* Character limits are only enforced for URLs that have a domain. */
    if (deco.domain) {
        if (deco.domain.length > MAX_DOMAIN_CHARS) {
            /* Keep only whole labels that fit in the last MAX_DOMAIN_CHARS. */
            const idx = deco.domain.indexOf(".", deco.domain.length -
                                            MAX_DOMAIN_CHARS);
            if (idx === -1)
                deco.domain = [];
            else
                deco.domain = deco.domain.substring(idx + 1);

            deco.domain_truncated = true;
        }

        if (deco.path.length > MAX_URL_PATH_CHARS) {
            /*
             * NOTE(review): this only drops the last path segment, so the
             * result may still be longer than MAX_URL_PATH_CHARS - confirm
             * whether lastIndexOf("/", MAX_URL_PATH_CHARS) was intended.
             */
            deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
            deco.path_truncated = true;
        }
    }

    /* Domain is still a string unless it got replaced with [] above. */
    if (typeof deco.domain === "string") {
        deco.domain = deco.domain.split(".");
        /* Drop leading labels so that at most MAX_DOMAIN_LEN remain. */
        const dropped = deco.domain.splice(0, deco.domain.length -
                                           MAX_DOMAIN_LEN);
        if (dropped.length > 0)
            deco.domain_truncated = true;
    }

    deco.path = deco.path.split("/").filter(s => s !== "");
    /* Segment count limit is, again, only enforced when there is a domain. */
    if (deco.domain && deco.path.splice(MAX_URL_PATH_LEN).length > 0)
        deco.path_truncated = true;
    if (leading_dash || deco.path.length === 0)
        deco.path.unshift("");

    return deco;
}

/*
 * Yield domain patterns for a deconstructed URL, starting from the full
 * domain and then dropping leading labels one at a time. The last label alone
 * is never used, so a domain with fewer than 2 labels yields nothing.
 * The exact domain and the "*." form are skipped when the domain was
 * truncated.
 */
function* each_domain_pattern(deco)
{
    for (let slice = 0; slice < deco.domain.length - 1; slice++) {
        const domain_part = deco.domain.slice(slice).join(".");

        if (slice === 0 && !deco.domain_truncated)
            yield domain_part;
        if (slice === 1 && !deco.domain_truncated)
            yield "*." + domain_part;
        if (slice > 1)
            yield "**." + domain_part;
        yield "***." + domain_part;
    }
}

/*
 * Yield path patterns for a deconstructed URL, starting from the full path
 * and then dropping trailing segments one at a time. The exact path and the
 * "/*" form are skipped when the path was truncated.
 */
function* each_path_pattern(deco)
{
    for (let slice = deco.path.length; slice > 0; slice--) {
        const path_part = deco.path.slice(0, slice).join("/");

        if (slice === deco.path.length && !deco.path_truncated) {
            if (deco.trailing_dash)
                yield path_part + "/";
            yield path_part;
        }
        /* Don't repeat the exact path when its last segment is a literal "*". */
        if (slice === deco.path.length - 1 && !deco.path_truncated &&
            deco.path[slice] !== "*")
            yield path_part + "/*";
        if (slice < deco.path.length - 1)
            yield path_part + "/**";
        /* Likewise skip "/***" when it would just repeat the exact path. */
        if (slice !== deco.path.length - 1 || deco.path_truncated ||
            deco.path[slice] !== "***")
            yield path_part + "/***";
    }
}

/*
 * Generate every possible pattern that matches url. If url cannot be
 * deconstructed, an error is logged and nothing is yielded.
 */
function* each_url_pattern(url)
{
    const deco = deconstruct_url(url);

    if (deco === undefined) {
        console.error("bad url format", url);
        return false;
    }

    /* URLs without a domain ("file") get a single, empty domain part. */
    const all_domains = deco.domain ? each_domain_pattern(deco) : [""];
    for (const domain of all_domains) {
        for (const path of each_path_pattern(deco))
            yield `${deco.proto}://${domain}${path}`;
    }
}

/*
 * EXPORTS_START
 * EXPORT each_url_pattern
 * EXPORTS_END
 */