/**
 * This file is part of Haketilo.
 *
 * Function: Operations on page URL patterns.
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */

/*
 * Default limits used by deconstruct_url(). The *_LEN entries are compared
 * against the number of path segments / domain labels, the *_CHARS entries
 * against the length of the raw path / domain string.
 */
const MAX = {
    URL_PATH_LEN:   12,
    URL_PATH_CHARS: 255,
    DOMAIN_LEN:     7,
    DOMAIN_CHARS:   100
};

/* Splits "<proto>://<rest>" into the protocol name and everything after it. */
const proto_regex = /^(\w+):\/\/(.*)$/;

const user_re = "[^/?#@]+@";
const domain_re = "[.a-zA-Z0-9-]+";
const path_re = "[^?#]*";
const query_re = "\\??[^#]*";

const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);
const file_regex = new RegExp(`^(${path_re}).*`);
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);

/*
 * Split a URL into the pieces needed for pattern generation.
 *
 * @param url        URL string of the form "<proto>://...". Supported
 *                   protocols: "file", "ftp", "http", "https".
 * @param use_limits When true (the default) the limits from MAX are applied;
 *                   when false every limit is treated as Infinity for this
 *                   call only.
 *
 * @returns An object with:
 *            proto             - protocol name,
 *            domain            - array of domain labels (absent for "file"),
 *            path              - array of non-empty path segments,
 *            query             - query string (http/https only),
 *            trailing_dash     - whether the raw path ended with "/",
 *            domain_truncated,
 *            path_truncated    - set to true only when a limit cut the
 *                                respective part (otherwise left unset).
 *          Returns undefined when the part after "://" of an ftp/http/https
 *          URL cannot be parsed (e.g. it has no host).
 *
 * @throws A string when `url` does not look like "<proto>://..." or uses an
 *         unsupported protocol.
 */
function deconstruct_url(url, use_limits=true)
{
    /*
     * Work on a private copy of the limits. Writing Infinity into MAX itself
     * would silently disable limits for every later call.
     */
    const max = Object.assign({}, MAX);
    if (!use_limits) {
        for (const key of Object.keys(max))
            max[key] = Infinity;
    }

    const proto_match = proto_regex.exec(url);
    if (proto_match === null)
        throw `bad url '${url}'`;

    const deco = {proto: proto_match[1]};
    const rest = proto_match[2];

    if (deco.proto === "file") {
        /* file_regex can match the empty string, so exec() never fails. */
        deco.path = file_regex.exec(rest)[1];
    } else if (deco.proto === "ftp") {
        const ftp_match = ftp_regex.exec(rest);
        if (ftp_match === null)
            return undefined;

        /* Group 1 is the optional "user@" part, which is discarded. */
        [deco.domain, deco.path] = ftp_match.slice(2, 4);
    } else if (deco.proto === "http" || deco.proto === "https") {
        const http_match = http_regex.exec(rest);
        if (http_match === null)
            return undefined;

        [deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
        deco.domain = deco.domain.toLowerCase();
    } else {
        throw `unsupported protocol in url '${url}'`;
    }

    deco.trailing_dash = deco.path.endsWith("/");

    /* Character limits are only applied to URLs that have a domain. */
    if (deco.domain) {
        if (deco.domain.length > max.DOMAIN_CHARS) {
            /* Keep the longest suffix that starts right after a dot. */
            const idx = deco.domain.indexOf(".", deco.domain.length -
                                            max.DOMAIN_CHARS);
            if (idx === -1)
                deco.domain = [];
            else
                deco.domain = deco.domain.substring(idx + 1);

            deco.domain_truncated = true;
        }

        if (deco.path.length > max.URL_PATH_CHARS) {
            /*
             * NOTE(review): this drops only the last segment, so the result
             * may still be longer than URL_PATH_CHARS - confirm whether that
             * is intended before changing it.
             */
            deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
            deco.path_truncated = true;
        }
    }

    /* The domain is already an (empty) array if it got truncated to nothing. */
    if (typeof deco.domain === "string") {
        deco.domain = deco.domain.split(".");
        /* Drop leading labels so that at most DOMAIN_LEN remain. */
        const dropped =
              deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN);
        if (dropped.length > 0)
            deco.domain_truncated = true;
    }

    deco.path = deco.path.split("/").filter(s => s !== "");
    /* Segment limit, like the character limits, needs a domain to apply. */
    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
        deco.path_truncated = true;

    return deco;
}

/*
 * Yield every domain pattern matching the domain of a deconstructed URL,
 * starting with the most specific one. Exact and "*." patterns are skipped
 * when the domain was truncated. Nothing is yielded for domains with fewer
 * than 2 labels.
 */
function* each_domain_pattern(deco)
{
    for (let slice = 0; slice < deco.domain.length - 1; slice++) {
        const domain_part = deco.domain.slice(slice).join(".");

        if (slice === 0 && !deco.domain_truncated)
            yield domain_part;
        if (slice === 1 && !deco.domain_truncated)
            yield "*." + domain_part;
        if (slice > 1)
            yield "**." + domain_part;
        yield "***." + domain_part;
    }
}

/*
 * Yield every path pattern matching the path of a deconstructed URL,
 * starting with the most specific one (the full path) and going down to
 * the root wildcards.
 */
function* each_path_pattern(deco)
{
    const last = deco.path.length - 1;

    for (let slice = deco.path.length; slice >= 0; slice--) {
        const path_part = ["", ...deco.path.slice(0, slice)].join("/");

        if (slice === deco.path.length && !deco.path_truncated) {
            if (deco.trailing_dash)
                yield path_part + "/";
            /* The empty path is not a valid pattern for file:// URLs. */
            if (slice > 0 || deco.proto !== "file")
                yield path_part;
        }
        if (slice === last && !deco.path_truncated &&
            deco.path[slice] !== "*")
            yield path_part + "/*";
        if (slice < last)
            yield path_part + "/**";
        if (slice !== last || deco.path_truncated ||
            deco.path[slice] !== "***")
            yield path_part + "/***";
    }
}

/* Generate every possible pattern that matches url. */
function* each_url_pattern(url)
{
    const deco = deconstruct_url(url);

    if (deco === undefined) {
        console.error("bad url format", url);
        return false;
    }

    /* URLs without a domain (file://) get a single empty domain part. */
    const all_domains = deco.domain ? each_domain_pattern(deco) : [""];
    for (const domain of all_domains) {
        for (const path of each_path_pattern(deco))
            yield `${deco.proto}://${domain}${path}`;
    }
}

/*
 * EXPORTS_START
 * EXPORT each_url_pattern
 * EXPORT deconstruct_url
 * EXPORTS_END
 */