diff options
Diffstat (limited to 'src/hydrilla/proxy/http_messages.py')
-rw-r--r-- | src/hydrilla/proxy/http_messages.py | 33 |
1 files changed, 5 insertions, 28 deletions
diff --git a/src/hydrilla/proxy/http_messages.py b/src/hydrilla/proxy/http_messages.py index 718022f..74f1f02 100644 --- a/src/hydrilla/proxy/http_messages.py +++ b/src/hydrilla/proxy/http_messages.py @@ -30,6 +30,7 @@ """ import re +import cgi import dataclasses as dc import typing as t import sys @@ -120,42 +121,18 @@ def make_parsed_url(url: t.Union[str, url_patterns.ParsedUrl]) \ return url_patterns.parse_url(url) if isinstance(url, str) else url -# For details of 'Content-Type' header's structure, see: -# https://datatracker.ietf.org/doc/html/rfc7231#section-3.1.1.1 -content_type_reg = re.compile(r''' -^ -(?P<mime>[\w-]+/[\w-]+) -\s* -(?: - ; - (?:[^;]*;)* # match possible parameter other than "charset" -) -\s* -charset= # no whitespace allowed in parameter as per RFC -(?P<encoding> - [\w-]+ - | - "[\w-]+" # quotes are optional per RFC -) -(?:;[^;]+)* # match possible parameter other than "charset" -$ # forbid possible dangling characters after closing '"' -''', re.VERBOSE | re.IGNORECASE) - @dc.dataclass(frozen=True) class HasHeadersMixin: headers: IHeaders def deduce_content_type(self) -> tuple[t.Optional[str], t.Optional[str]]: - content_type = self.headers.get('content-type') - if content_type is None: - return (None, None) - - match = content_type_reg.match(content_type) - if match is None: + content_type_header = self.headers.get('content-type') + if content_type_header is None: return (None, None) - mime, encoding = match.group('mime'), match.group('encoding') + mime, options = cgi.parse_header(content_type_header) + encoding = options.get('charset') if encoding is not None: encoding = encoding.lower() |