Restores compatibility with Python >=3.9, which removed the custom .error() method in https://github.com/python/cpython/commit/e34bbfd61f405eef89e8aa50672b0b25022de320 Despite the big diff, only a try…except clause is added. --- source/sgmllib.py 2023-03-18 08:57:58.726240606 +0100 +++ source/sgmllib.py 2023-03-18 09:02:01.667568916 +0100 @@ -101,113 +101,116 @@ """Handle the remaining data.""" self.goahead(1) - def error(self, message): - raise SGMLParseError(message) - # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): - rawdata = self.rawdata - i = 0 - n = len(rawdata) - while i < n: - if self.nomoretags: - self.handle_data(rawdata[i:n]) - i = n - break - match = interesting.search(rawdata, i) - if match: j = match.start() - else: j = n - if i < j: - self.handle_data(rawdata[i:j]) - i = j - if i == n: break - if rawdata[i] == '<': - if starttagopen.match(rawdata, i): + try: + rawdata = self.rawdata + i = 0 + n = len(rawdata) + while i < n: + if self.nomoretags: + self.handle_data(rawdata[i:n]) + i = n + break + match = interesting.search(rawdata, i) + if match: j = match.start() + else: j = n + if i < j: + self.handle_data(rawdata[i:j]) + i = j + if i == n: break + if rawdata[i] == '<': + if starttagopen.match(rawdata, i): + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + k = self.parse_starttag(i) + if k < 0: break + i = k + continue + if rawdata.startswith("</", i): + k = self.parse_endtag(i) + if k < 0: break + i = k + self.literal = 0 + continue + if self.literal: + if n > (i + 1): + self.handle_data("<") + i = i+1 + else: + # incomplete + break + continue + if rawdata.startswith("<!--", i): + # Strictly speaking, a comment is --.*-- + # within a declaration tag <!...>. + # This should be removed, + # and comments handled only in parse_declaration. + k = self.parse_comment(i) + if k < 0: break + i = k + continue + if rawdata.startswith("<?", i): + k = self.parse_pi(i) + if k < 0: break + i = i+k + continue + if rawdata.startswith("<!", i): + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). + k = self.parse_declaration(i) + if k < 0: break + i = k + continue + elif rawdata[i] == '&': if self.literal: self.handle_data(rawdata[i]) i = i+1 continue - k = self.parse_starttag(i) - if k < 0: break - i = k - continue - if rawdata.startswith("</", i): - k = self.parse_endtag(i) - if k < 0: break - i = k - self.literal = 0 - continue - if self.literal: - if n > (i + 1): - self.handle_data("<") - i = i+1 - else: - # incomplete - break - continue - if rawdata.startswith("<!--", i): - # Strictly speaking, a comment is --.*-- - # within a declaration tag <!...>. - # This should be removed, - # and comments handled only in parse_declaration. - k = self.parse_comment(i) - if k < 0: break - i = k - continue - if rawdata.startswith("<?", i): - k = self.parse_pi(i) - if k < 0: break - i = i+k - continue - if rawdata.startswith("<!", i): - # This is some sort of declaration; in "HTML as - # deployed," this should only be the document type - # declaration ("<!DOCTYPE html...>"). - k = self.parse_declaration(i) - if k < 0: break - i = k - continue - elif rawdata[i] == '&': - if self.literal: + match = charref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_charref(name) + i = match.end(0) + if rawdata[i-1] != ';': i = i-1 + continue + match = entityref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_entityref(name) + i = match.end(0) + if rawdata[i-1] != ';': i = i-1 + continue + else: + self.error('neither < nor & ??') + # We get here only if incomplete matches but + # nothing else + match = incomplete.match(rawdata, i) + if not match: self.handle_data(rawdata[i]) i = i+1 continue - match = charref.match(rawdata, i) - if match: - name = match.group(1) - self.handle_charref(name) - i = match.end(0) - if rawdata[i-1] != ';': i = i-1 - continue - match = entityref.match(rawdata, i) - if match: - name = match.group(1) - self.handle_entityref(name) - i = match.end(0) - if rawdata[i-1] != ';': i = i-1 - continue - else: - self.error('neither < nor & ??') - # We get here only if incomplete matches but - # nothing else - match = incomplete.match(rawdata, i) - if not match: - self.handle_data(rawdata[i]) - i = i+1 - continue - j = match.end(0) - if j == n: - break # Really incomplete - self.handle_data(rawdata[i:j]) - i = j - # end while - if end and i < n: - self.handle_data(rawdata[i:n]) - i = n - self.rawdata = rawdata[i:] - # XXX if end: check for empty stack + j = match.end(0) + if j == n: + break # Really incomplete + self.handle_data(rawdata[i:j]) + i = j + # end while + if end and i < n: + self.handle_data(rawdata[i:n]) + i = n + self.rawdata = rawdata[i:] + # XXX if end: check for empty stack + except AssertionError as e: + # The .error() method, which threw the custom SGMLParseError was removed + # by https://github.com/python/cpython/issues/76025. So we have to catch + # _markupbase’s AssertionError and translate it into the old one. + raise SGMLParseError (e.args[0]) from e # Extensions for the DOCTYPE scanner: _decl_otherchars = '='