From 43ed7392cdfc734a4304284906b9d0d503381841 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 28 Oct 2022 20:15:30 +0200 Subject: [proxy] make it possible to export embedded documentation as standalone .html files and include these in the binary release tarball --- .../proxy/self_doc/en_US/url_patterns.html.jinja | 409 +++++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 src/hydrilla/proxy/self_doc/en_US/url_patterns.html.jinja (limited to 'src/hydrilla/proxy/self_doc/en_US/url_patterns.html.jinja') diff --git a/src/hydrilla/proxy/self_doc/en_US/url_patterns.html.jinja b/src/hydrilla/proxy/self_doc/en_US/url_patterns.html.jinja new file mode 100644 index 0000000..f3415c5 --- /dev/null +++ b/src/hydrilla/proxy/self_doc/en_US/url_patterns.html.jinja @@ -0,0 +1,409 @@ +{# +SPDX-License-Identifier: GPL-3.0-or-later OR CC-BY-SA-4.0 + +Documentation page describing URL patterns understood by Haketilo. + +This file is part of Hydrilla&Haketilo. + +Copyright (C) 2022 Wojtek Kosior + +Dual licensed under +* GNU General Public License v3.0 or later and +* Creative Commons Attribution Share Alike 4.0 International. + +You can choose to use either of these licenses or both. + + +I, Wojtek Kosior, thereby promise not to sue for violation of this +file's licenses. Although I request that you do not make use of this +code in a proprietary work, I am not going to enforce this in court. +#} +{% extends "doc_base.html.jinja" %} + +{% block title %} URL patterns {% endblock %} + +{% block main %} + {{ big_heading('Haketilo URL patterns') }} + + {% call section() %} + {% call paragraph() %} + We want to be able to apply different rules and custom scripts for + different websites. However, merely specifying "do this for all documents + under https://example.com" is not enough. Single site's pages + might differ strongly and require different custom scripts to be + loaded. 
Always matching against a full URL like + https://example.com/something/somethingelse is also not + a good option. It doesn't allow us to properly handle a site that serves + similar pages for multiple values substituted for + somethingelse. + {% endcall %} + {% endcall %} + + {% call section() %} + {{ medium_heading('Employed solution') }} + + {% call paragraph() %} + Wildcards are being used to address the problem. Each payload and rule in + Haketilo has a URL pattern that specifies to which internet pages it + applies. A URL pattern can be as simple as a literal URL, in which case it + only matches itself. It can also contain wildcards in the form of one or + more asterisks (*) that correspond to multiple possible + strings occurring in that place. + {% endcall %} + + {% call paragraph() %} + Wildcards can appear in URL's domain and path that follows it. These 2 + types of wildcards are handled separately. + {% endcall %} + {% endcall %} + + {% call section() %} + {{ small_heading('Domain wildcards') }} + + {% call paragraph() %} + A domain wildcard takes the form of one, two or three asterisks occurring + in place of a single domain name segment at the beginning + (left). Depending on the number of asterisks, the meaning is as follows + {% endcall %} + + {% call unordered_list() %} + {% call list_entry() %} + no asterisks (e.g. example.com) - match domain name exactly + (e.g. example.com) + {% endcall %} + {% call list_entry() %} + one asterisk (e.g. *.example.com) - match all domains + resulting from substituting * with a + single segment (e.g. + banana.example.com or pineapple.example.com + but not pineapple.pen.example.com + nor example.com) + {% endcall %} + {% call list_entry() %} + two asterisks (e.g. **.example.com) - match all domains + resulting from substituting ** with + two or more segments (e.g. 
+ monad.breakfast.example.com or + pure.monad.breakfast.example.com but + not cabalhell.example.com nor + example.com) + {% endcall %} + {% call list_entry() %} + three asterisks (e.g. ***.example.com) - match all domains + resulting from substituting *** with + zero or more segments (e.g. + hello.parkmeter.example.com or + iliketrains.example.com or example.com) + {% endcall %} + {% endcall %} + {% endcall %} + + {% call section() %} + {{ small_heading('Path wildcards') }} + + {% call paragraph() %} + A path wildcard takes the form of one, two or three asterisks occurring in + place of a single path segment at the end of path (right). Depending on + the number of asterisks, the meaning is as follows + {% endcall %} + + {% call unordered_list() %} + {% call list_entry() %} + no asterisks (e.g. /joke/clowns) - match path exactly (e.g. + /joke/clowns) + {% endcall %} + {% call list_entry() %} + one asterisk (e.g. /itscalled/*) - match all paths + resulting from substituting * with a + single segment (e.g. + /itscalled/gnulinux or /itscalled/glamp but + not /itscalled/ nor + /itscalled/gnu/linux) + {% endcall %} + {% call list_entry() %} + two asterisks (e.g. /another/**) - match all paths + resulting from substituting ** with + two or more segments (e.g. + /another/nsa/backdoor or + /another/best/programming/language but + not /another/apibreak nor + /another) + {% endcall %} + {% call list_entry() %} + three asterisks (e.g. /mail/dmarc/***) - match all paths + resulting from substituting *** with + zero or more segments (e.g. + /mail/dmarc/spf, /mail/dmarc or + /mail/dmarc/dkim/failure but + not /mail/) + {% endcall %} + {% endcall %} + + {% call paragraph() %} + If a pattern ends without a trailing slash, it + matches paths with any number of trailing slashes, including zero. If + a pattern ends with a trailing slash, it only + matches paths with one or more trailing slashes. 
For example, + /itscalled/* matches /itscalled/gnulinux, + /itscalled/gnulinux/ and /itscalled/gnulinux// + while /itscalled/*/ only matches + /itscalled/gnulinux/ and /itscalled/gnulinux// + out of those three. + {% endcall %} + + {% call paragraph() %} + If two patterns only differ by the presence of a trailing slash, + pattern with a trailing slash is considered + more specific. + {% endcall %} + + {% call paragraph() %} + Additionally, any path with literal trailing asterisks is matched by + itself, even if such pattern would otherwise be treated as wildcard + (e.g. /gobacktoxul/** matches /gobacktoxul/**). + This is likely to change in the future and would best not be relied upon. + Appending three additional asterisks to path pattern to represent literal + asterisks is being considered. + {% endcall %} + {% endcall %} + + {% call section() %} + {{ small_heading('URL scheme wildcard') }} + + {% call paragraph() %} + http:// and https:// schemes in the URL are + matched exactly. However, starting with Haketilo 3.0, it is also possible + for scheme pseudo-wildcard of http*:// to be used. Use of URL + pattern with this scheme is equivalent to the use of 2 separate patterns + starting with http:// and https://, + respectively. For example, pattern http*://example.com shall + match both https://example.com and + http://example.com. + {% endcall %} + + {% call paragraph() %} + http*:// may be considered not to be a true wildcard but + rather an alias for either of the other 2 values. As of Haketilo 3.0, the + specificity of a URL pattern starting with http*:// is + considered to be the same as that of the corresponding URL pattern + starting with http:// or https://. In case of a + conflict, the order of precedence of such patterns is unspecified. This + behavior is likely to change in the future versions of Haketilo. 
+ {% endcall %} + {% endcall %} + + {% call section() %} + {{ small_heading('Wildcard pattern priorities and querying') }} + + {% call paragraph() %} + In case multiple patterns match some URL, the more specific one is + preferred. Specificity is considered as follows + {% endcall %} + + {% call unordered_list() %} + {% call list_entry() %} + If patterns only differ in the final path segment, the one with fewest + wildcard asterisks in that segment is preferred. + {% endcall %} + {% call list_entry() %} + If patterns, besides the above, only differ in path length, one with + longer path is preferred. Neither final wildcard segment nor trailing + slashes account for path length. + {% endcall %} + {% call list_entry() %} + If patterns, besides the above, only differ in the initial domain + segment, one with fewest wildcard asterisks in that segment is preferred. + {% endcall %} + {% call list_entry() %} + If patterns differ in domain length, one with longer domain is + preferred. Initial wildcard segment does not account for domain length. + {% endcall %} + {% endcall %} + + {% call paragraph() %} + As an example, consider the URL + http://settings.query.example.com/google/tries/destroy/adblockers//. 
+ Patterns matching it are, in the following order + {% endcall %} + + {% call verbatim() %} +http://settings.query.example.com/google/tries/destroy/adblockers/ +http://settings.query.example.com/google/tries/destroy/adblockers +http://settings.query.example.com/google/tries/destroy/adblockers/***/ +http://settings.query.example.com/google/tries/destroy/adblockers/*** +http://settings.query.example.com/google/tries/destroy/*/ +http://settings.query.example.com/google/tries/destroy/* +http://settings.query.example.com/google/tries/destroy/***/ +http://settings.query.example.com/google/tries/destroy/*** +http://settings.query.example.com/google/tries/**/ +http://settings.query.example.com/google/tries/** +http://settings.query.example.com/google/tries/***/ +http://settings.query.example.com/google/tries/*** +http://settings.query.example.com/google/**/ +http://settings.query.example.com/google/** +http://settings.query.example.com/google/***/ +http://settings.query.example.com/google/*** +http://settings.query.example.com/**/ +http://settings.query.example.com/** +http://settings.query.example.com/***/ +http://settings.query.example.com/*** +http://***.settings.query.example.com/google/tries/destroy/adblockers/ +http://***.settings.query.example.com/google/tries/destroy/adblockers +http://***.settings.query.example.com/google/tries/destroy/adblockers/***/ +http://***.settings.query.example.com/google/tries/destroy/adblockers/*** +http://***.settings.query.example.com/google/tries/destroy/*/ +http://***.settings.query.example.com/google/tries/destroy/* +http://***.settings.query.example.com/google/tries/destroy/***/ +http://***.settings.query.example.com/google/tries/destroy/*** +http://***.settings.query.example.com/google/tries/**/ +http://***.settings.query.example.com/google/tries/** +http://***.settings.query.example.com/google/tries/***/ +http://***.settings.query.example.com/google/tries/*** +http://***.settings.query.example.com/google/**/ 
+http://***.settings.query.example.com/google/** +http://***.settings.query.example.com/google/***/ +http://***.settings.query.example.com/google/*** +http://***.settings.query.example.com/**/ +http://***.settings.query.example.com/** +http://***.settings.query.example.com/***/ +http://***.settings.query.example.com/*** +http://*.query.example.com/google/tries/destroy/adblockers/ +http://*.query.example.com/google/tries/destroy/adblockers +http://*.query.example.com/google/tries/destroy/adblockers/***/ +http://*.query.example.com/google/tries/destroy/adblockers/*** +http://*.query.example.com/google/tries/destroy/*/ +http://*.query.example.com/google/tries/destroy/* +http://*.query.example.com/google/tries/destroy/***/ +http://*.query.example.com/google/tries/destroy/*** +http://*.query.example.com/google/tries/**/ +http://*.query.example.com/google/tries/** +http://*.query.example.com/google/tries/***/ +http://*.query.example.com/google/tries/*** +http://*.query.example.com/google/**/ +http://*.query.example.com/google/** +http://*.query.example.com/google/***/ +http://*.query.example.com/google/*** +http://*.query.example.com/**/ +http://*.query.example.com/** +http://*.query.example.com/***/ +http://*.query.example.com/*** +http://***.query.example.com/google/tries/destroy/adblockers/ +http://***.query.example.com/google/tries/destroy/adblockers +http://***.query.example.com/google/tries/destroy/adblockers/***/ +http://***.query.example.com/google/tries/destroy/adblockers/*** +http://***.query.example.com/google/tries/destroy/*/ +http://***.query.example.com/google/tries/destroy/* +http://***.query.example.com/google/tries/destroy/***/ +http://***.query.example.com/google/tries/destroy/*** +http://***.query.example.com/google/tries/**/ +http://***.query.example.com/google/tries/** +http://***.query.example.com/google/tries/***/ +http://***.query.example.com/google/tries/*** +http://***.query.example.com/google/**/ +http://***.query.example.com/google/** 
+http://***.query.example.com/google/***/ +http://***.query.example.com/google/*** +http://***.query.example.com/**/ +http://***.query.example.com/** +http://***.query.example.com/***/ +http://***.query.example.com/*** +http://**.example.com/google/tries/destroy/adblockers/ +http://**.example.com/google/tries/destroy/adblockers +http://**.example.com/google/tries/destroy/adblockers/***/ +http://**.example.com/google/tries/destroy/adblockers/*** +http://**.example.com/google/tries/destroy/*/ +http://**.example.com/google/tries/destroy/* +http://**.example.com/google/tries/destroy/***/ +http://**.example.com/google/tries/destroy/*** +http://**.example.com/google/tries/**/ +http://**.example.com/google/tries/** +http://**.example.com/google/tries/***/ +http://**.example.com/google/tries/*** +http://**.example.com/google/**/ +http://**.example.com/google/** +http://**.example.com/google/***/ +http://**.example.com/google/*** +http://**.example.com/**/ +http://**.example.com/** +http://**.example.com/***/ +http://**.example.com/*** +http://***.example.com/google/tries/destroy/adblockers/ +http://***.example.com/google/tries/destroy/adblockers +http://***.example.com/google/tries/destroy/adblockers/***/ +http://***.example.com/google/tries/destroy/adblockers/*** +http://***.example.com/google/tries/destroy/*/ +http://***.example.com/google/tries/destroy/* +http://***.example.com/google/tries/destroy/***/ +http://***.example.com/google/tries/destroy/*** +http://***.example.com/google/tries/**/ +http://***.example.com/google/tries/** +http://***.example.com/google/tries/***/ +http://***.example.com/google/tries/*** +http://***.example.com/google/**/ +http://***.example.com/google/** +http://***.example.com/google/***/ +http://***.example.com/google/*** +http://***.example.com/**/ +http://***.example.com/** +http://***.example.com/***/ +http://***.example.com/*** + {% endcall %} + + {% call paragraph() %} + Variants of those patterns starting with http*:// would of + course 
match as well. They have been omitted for simplicity. + {% endcall %} + + {% call paragraph() %} + For a simpler URL like https://example.com the patterns would + be + {% endcall %} + + {% call verbatim() %} +https://example.com +https://example.com/*** +https://***.example.com +https://***.example.com/*** + {% endcall %} + + {% call paragraph() %} + Variants of those patterns with a trailing slash added + would not match the URL. Also, the pattern + variants starting with http*:// have been once again omitted. + {% endcall %} + {% endcall %} + + {% call section() %} + {{ small_heading('Limits') }} + + {% call paragraph() %} + In order to prevent some easy-to-conduct DoS attacks, older versions of + Haketilo and Hydrilla limited the lengths of domain and path parts of + processed URLs. This is no longer the case. + {% endcall %} + {% endcall %} + + {% call section() %} + {{ medium_heading('Alternative solution idea: mimicking web server mechanics') }} + + {% call paragraph() %} + While wildcard patterns as presented give a lot of flexibility, they are + not the only viable approach to specifying what URLs to apply + rules/payloads to. In fact, wildcards are different from how the server + side of a typical website decides what to return for a given URL request. + {% endcall %} + + {% call paragraph() %} + In a typical scenario, an HTTP server like Apache reads configuration + files provided by its administrator and uses various (virtual host, + redirect, request rewrite, CGI, etc.) instructions to decide how to handle + a given URL. Perhaps using a scheme that mimics the configuration options + typically used with web servers would give more efficiency in specifying + what page settings to apply when. + {% endcall %} + + {% call paragraph() %} + This approach may be considered in the future. + {% endcall %} + {% endcall %} +{% endblock main %} -- cgit v1.2.3