from . import inputstream from .constants import ( ReparseError, Token, adjust_foreign_attributes, adjust_mathml_attributes, adjust_svg_attributes, ascii_upper_to_lower, cdata_elements, heading_elements, html_integration_point_elements, mathml_text_integration_point_elements, namespaces, rcdata_elements, space_characters, special_elements, ) from .tokenizer import HTMLTokenizer from .treebuilder import Marker, TreeBuilder def parse(document, namespace_html_elements=True, **kwargs): """Parse an HTML document into a tree. :param document: The document to parse as a HTML string, filename, file-like object. :type document: :class:`str`, :class:`bytes`, :class:`pathlib.Path` or :term:`file object` :param bool namespace_html_elements: Whether or not to namespace HTML elements. Extra parameters can be provided to define possible encodings if the document is given as :class:`bytes`. :param override_encoding: Forced encoding provided by user agent. :type override_encoding: str or bytes :param transport_encoding: Encoding provided by transport layout. :type transport_encoding: str or bytes :param same_origin_parent_encoding: Parent document encoding. :type same_origin_parent_encoding: str or bytes :param likely_encoding: Possible encoding provided by user agent. :type likely_encoding: str or bytes :param default_encoding: Encoding used as fallback. :type default_encoding: str or bytes :returns: :class:`xml.etree.ElementTree.Element`. Example: >>> from tinyhtml5 import parse >>> parse('

This is a doc

') """ return HTMLParser(namespace_html_elements).parse(document, **kwargs) class HTMLParser: """HTML parser. Generate a tree structure from a stream of (possibly malformed) HTML. """ def __init__(self, namespace_html_elements=True): self.tree = TreeBuilder(namespace_html_elements) self.errors = [] self.phases = {name: cls(self, self.tree) for name, cls in _phases.items()} def _parse(self, stream, container=None, scripting=False, **kwargs): self.container = container self.scripting = scripting self.tokenizer = HTMLTokenizer(stream, parser=self, **kwargs) self.reset() try: self.main_loop() except ReparseError: self.reset() self.main_loop() def reset(self): self.tree.reset() self.first_start_tag = False self.errors = [] self.compatibility_mode = "no quirks" # or "quirks" or "limited quirks" if self.container: if self.container in cdata_elements: self.tokenizer.state = self.tokenizer.rcdata_state elif self.container in rcdata_elements: self.tokenizer.state = self.tokenizer.rawtext_state elif self.container == 'plaintext': self.tokenizer.state = self.tokenizer.plaintext_state else: # State already is data state. # self.tokenizer.state = self.tokenizer.data_state pass self.phase = self.phases["before html"] self.phase._insert_html_element() self.reset_insertion_mode() else: self.phase = self.phases["initial"] self.last_phase = None self.before_rcdata_phase = None self.frameset_ok = True @property def encoding(self): """Name of the character encoding that was used to decode the input stream. :obj:`None` if that is not determined yet. """ if hasattr(self, 'tokenizer'): return self.tokenizer.stream.encoding[0].name def is_html_integration_point(self, element): full_name = (element.namespace, element.name) if full_name == (namespaces["mathml"], "annotation-xml"): return ( "encoding" in element.attributes and element.attributes["encoding"].translate(ascii_upper_to_lower) in ("text/html", "application/xhtml+xml")) return full_name in html_integration_point_elements def is_mathml_text_integration_point(self, element): full_name = (element.namespace, element.name) return full_name in mathml_text_integration_point_elements def main_loop(self): for token in self.tokenizer: previous_token = None new_token = token while new_token is not None: previous_token = new_token current_node = ( self.tree.open_elements[-1] if self.tree.open_elements else None) current_node_namespace = ( current_node.namespace if current_node else None) current_node_name = current_node.name if current_node else None type = new_token["type"] if type == Token.PARSE_ERROR: self.parse_error(new_token["data"], new_token.get("datavars", {})) new_token = None else: if (len(self.tree.open_elements) == 0 or current_node_namespace == self.tree.default_namespace or (self.is_mathml_text_integration_point(current_node) and ((type == Token.START_TAG and token["name"] not in frozenset(["mglyph", "malignmark"])) or type in (Token.CHARACTERS, Token.SPACE_CHARACTERS))) or (current_node_namespace == namespaces["mathml"] and current_node_name == "annotation-xml" and type == Token.START_TAG and token["name"] == "svg") or (self.is_html_integration_point(current_node) and type in ( Token.START_TAG, Token.CHARACTERS, Token.SPACE_CHARACTERS))): phase = self.phase else: phase = self.phases["in foreign content"] if type == Token.CHARACTERS: new_token = phase.process_characters(new_token) elif type == Token.SPACE_CHARACTERS: new_token = phase.process_space_characters(new_token) elif type == Token.START_TAG: new_token = phase.process_start_tag(new_token) elif type == Token.END_TAG: new_token = phase.process_end_tag(new_token) elif type == Token.COMMENT: new_token = phase.process_comment(new_token) elif type == Token.DOCTYPE: new_token = phase.process_doctype(new_token) if (type == Token.START_TAG and previous_token["selfClosing"] and not previous_token["selfClosingAcknowledged"]): self.parse_error( "non-void-element-with-trailing-solidus", {"name": previous_token["name"]}) # When the loop finishes it's EOF. reprocess = True phases = [] while reprocess: phases.append(self.phase) reprocess = self.phase.process_eof() if reprocess: assert self.phase not in phases def parse(self, stream, full_tree=False, **kwargs): """Parse a HTML document into a well-formed tree. If ``full_tree`` is ``True``, return the whole tree. """ self._parse(stream, **kwargs) return self.tree.get_document(full_tree) def parse_fragment(self, stream, container="div", **kwargs): """Parse a HTML fragment into a well-formed tree fragment. ``container`` is the tag name of the fragment’s container. """ self._parse(stream, container=container, **kwargs) return self.tree.get_fragment() def parse_error(self, errorcode, datavars=None): if datavars is None: datavars = {} self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) def adjust_mathml_attributes(self, token): adjust_attributes(token, adjust_mathml_attributes) def adjust_svg_attributes(self, token): adjust_attributes(token, adjust_svg_attributes) def adjust_foreign_attributes(self, token): adjust_attributes(token, adjust_foreign_attributes) def reset_insertion_mode(self): # The name of this method is mostly historical. (It's also used in the # specification.) last = False new_modes = { "select": "in select", "td": "in cell", "th": "in cell", "tr": "in row", "tbody": "in table body", "thead": "in table body", "tfoot": "in table body", "caption": "in caption", "colgroup": "in column group", "table": "in table", "head": "in body", "body": "in body", "frameset": "in frameset", "html": "before head" } for node in self.tree.open_elements[::-1]: node_name = node.name new_phase = None if node == self.tree.open_elements[0]: assert self.container last = True node_name = self.container # Check for conditions that should only happen in the fragment case. if node_name in ("select", "colgroup", "head", "html"): assert self.container if not last and node.namespace != self.tree.default_namespace: continue if node_name in new_modes: new_phase = self.phases[new_modes[node_name]] break elif last: new_phase = self.phases["in body"] break self.phase = new_phase def parse_rcdata_rawtext(self, token, content_type): # Generic RCDATA/RAWTEXT Parsing algorithm. assert content_type in ("RAWTEXT", "RCDATA") self.tree.insert_element(token) if content_type == "RAWTEXT": self.tokenizer.state = self.tokenizer.rawtext_state else: self.tokenizer.state = self.tokenizer.rcdata_state self.original_phase = self.phase self.phase = self.phases["text"] def dispatch(items): return { key: value for keys, value in items for key in ((keys,) if isinstance(keys, str) else keys) } class Phase: """Base class for helper that implements each phase of processing.""" __slots__ = ("parser", "tree", "__start_tag_cache", "__end_tag_cache") def __init__(self, parser, tree): self.parser = parser self.tree = tree self.__start_tag_cache = {} self.__end_tag_cache = {} def process_eof(self): # pragma: no cover raise NotImplementedError def process_comment(self, token): # For most phases the following is correct. Where it's not it will be # overridden. self.tree.insert_comment(token, self.tree.open_elements[-1]) def process_doctype(self, token): self.parser.parse_error("unexpected-doctype") def process_characters(self, token): self.tree.insert_text(token["data"]) def process_space_characters(self, token): self.tree.insert_text(token["data"]) def process_start_tag(self, token): name = token["name"] # In Py3, `in` is quicker when there are few cache hits (typically # short inputs). if name in self.__start_tag_cache: function = self.__start_tag_cache[name] else: function = self.__start_tag_cache[name] = self.start_tag_handler.get( name, type(self).start_tag_other) # Bound the cache size in case we get loads of unknown tags. while len(self.__start_tag_cache) > len(self.start_tag_handler) * 1.1: # This makes the eviction policy random on Py < 3.7 and FIFO >= 3.7. self.__start_tag_cache.pop(next(iter(self.__start_tag_cache))) return function(self, token) def start_tag_html(self, token): if not self.parser.first_start_tag and token["name"] == "html": self.parser.parse_error("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parse_error(). for attr, value in token["data"].items(): if attr not in self.tree.open_elements[0].attributes: self.tree.open_elements[0].attributes[attr] = value self.parser.first_start_tag = False def process_end_tag(self, token): name = token["name"] # In Py3, `in` is quicker when there are few cache hits (typically # short inputs). if name in self.__end_tag_cache: function = self.__end_tag_cache[name] else: function = self.__end_tag_cache[name] = self.end_tag_handler.get( name, type(self).end_tag_other) # Bound the cache size in case we get loads of unknown tags. while len(self.__end_tag_cache) > len(self.end_tag_handler) * 1.1: # This makes the eviction policy random on Py < 3.7 and FIFO >= 3.7. self.__end_tag_cache.pop(next(iter(self.__end_tag_cache))) return function(self, token) class InitialPhase(Phase): __slots__ = tuple() def process_space_characters(self, token): pass def process_comment(self, token): self.tree.insert_comment(token, self.tree.document) def process_doctype(self, token): name = token["name"] public_id = token["publicId"] system_id = token["systemId"] correct = token["correct"] if (name != "html" or public_id is not None or system_id is not None and system_id != "about:legacy-compat"): self.parser.parse_error("unknown-doctype") if public_id is None: public_id = "" self.tree.insert_doctype(token) if public_id != "": public_id = public_id.translate(ascii_upper_to_lower) if (not correct or token["name"] != "html" or public_id.startswith( ("+//silmaril//dtd html pro v0r11 19970101//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", "-//as//dtd html 3.0 aswedit + extensions//", "-//ietf//dtd html 2.0 level 1//", "-//ietf//dtd html 2.0 level 2//", "-//ietf//dtd html 2.0 strict level 1//", "-//ietf//dtd html 2.0 strict level 2//", "-//ietf//dtd html 2.0 strict//", "-//ietf//dtd html 2.0//", "-//ietf//dtd html 2.1e//", "-//ietf//dtd html 3.0//", "-//ietf//dtd html 3.2 final//", "-//ietf//dtd html 3.2//", "-//ietf//dtd html 3//", "-//ietf//dtd html level 0//", "-//ietf//dtd html level 1//", "-//ietf//dtd html level 2//", "-//ietf//dtd html level 3//", "-//ietf//dtd html strict level 0//", "-//ietf//dtd html strict level 1//", "-//ietf//dtd html strict level 2//", "-//ietf//dtd html strict level 3//", "-//ietf//dtd html strict//", "-//ietf//dtd html//", "-//metrius//dtd metrius presentational//", "-//microsoft//dtd internet explorer 2.0 html strict//", "-//microsoft//dtd internet explorer 2.0 html//", "-//microsoft//dtd internet explorer 2.0 tables//", "-//microsoft//dtd internet explorer 3.0 html strict//", "-//microsoft//dtd internet explorer 3.0 html//", "-//microsoft//dtd internet explorer 3.0 tables//", "-//netscape comm. corp.//dtd html//", "-//netscape comm. corp.//dtd strict html//", "-//o'reilly and associates//dtd html 2.0//", "-//o'reilly and associates//dtd html extended 1.0//", "-//o'reilly and associates//dtd html extended relaxed 1.0//", "-//softquad software//dtd hotmetal pro 6.0::19990601::" "extensions to html 4.0//", "-//softquad//dtd hotmetal pro 4.0::19971010::" "extensions to html 4.0//", "-//spyglass//dtd html 2.0 extended//", "-//sq//dtd html 2.0 hotmetal + extensions//", "-//sun microsystems corp.//dtd hotjava html//", "-//sun microsystems corp.//dtd hotjava strict html//", "-//w3c//dtd html 3 1995-03-24//", "-//w3c//dtd html 3.2 draft//", "-//w3c//dtd html 3.2 final//", "-//w3c//dtd html 3.2//", "-//w3c//dtd html 3.2s draft//", "-//w3c//dtd html 4.0 frameset//", "-//w3c//dtd html 4.0 transitional//", "-//w3c//dtd html experimental 19960712//", "-//w3c//dtd html experimental 970421//", "-//w3c//dtd w3 html//", "-//w3o//dtd w3 html 3.0//", "-//webtechs//dtd mozilla html 2.0//", "-//webtechs//dtd mozilla html//")) or public_id in ("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en", "html") or public_id.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and system_id is None or system_id and system_id.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatibility_mode = "quirks" elif (public_id.startswith( ("-//w3c//dtd xhtml 1.0 frameset//", "-//w3c//dtd xhtml 1.0 transitional//")) or public_id.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and system_id is not None): self.parser.compatibility_mode = "limited quirks" self.parser.phase = self.parser.phases["before html"] def anything_else(self): self.parser.compatibility_mode = "quirks" self.parser.phase = self.parser.phases["before html"] def process_characters(self, token): self.parser.parse_error("expected-doctype-but-got-chars") self.anything_else() return token def process_start_tag(self, token): self.parser.parse_error( "expected-doctype-but-got-start-tag", {"name": token["name"]}) self.anything_else() return token def process_end_tag(self, token): self.parser.parse_error( "expected-doctype-but-got-end-tag", {"name": token["name"]}) self.anything_else() return token def process_eof(self): self.parser.parse_error("expected-doctype-but-got-eof") self.anything_else() return True class BeforeHtmlPhase(Phase): __slots__ = tuple() def _insert_html_element(self): self.tree.insert_root(implied_tag_token("html", "START_TAG")) self.parser.phase = self.parser.phases["before head"] def process_eof(self): self._insert_html_element() return True def process_comment(self, token): self.tree.insert_comment(token, self.tree.document) def process_space_characters(self, token): pass def process_characters(self, token): self._insert_html_element() return token def process_start_tag(self, token): if token["name"] == "html": self.parser.first_start_tag = True self._insert_html_element() return token def process_end_tag(self, token): if token["name"] not in ("head", "body", "html", "br"): self.parser.parse_error( "unexpected-end-tag-before-html", {"name": token["name"]}) else: self._insert_html_element() return token class BeforeHeadPhase(Phase): __slots__ = tuple() def process_eof(self): self.start_tag_head(implied_tag_token("head", "START_TAG")) return True def process_space_characters(self, token): pass def process_characters(self, token): self.start_tag_head(implied_tag_token("head", "START_TAG")) return token def start_tag_html(self, token): return self.parser.phases["in body"].process_start_tag(token) def start_tag_head(self, token): self.tree.insert_element(token) self.tree.head_element = self.tree.open_elements[-1] self.parser.phase = self.parser.phases["in head"] def start_tag_other(self, token): self.start_tag_head(implied_tag_token("head", "START_TAG")) return token def end_tag_imply_head(self, token): self.start_tag_head(implied_tag_token("head", "START_TAG")) return token def end_tag_other(self, token): self.parser.parse_error("end-tag-after-implied-root", {"name": token["name"]}) start_tag_handler = dispatch([ ("html", start_tag_html), ("head", start_tag_head) ]) end_tag_handler = dispatch([ (("head", "body", "html", "br"), end_tag_imply_head) ]) class InHeadPhase(Phase): __slots__ = tuple() # the real thing def process_eof(self): self.anything_else() return True def process_characters(self, token): self.anything_else() return token def start_tag_html(self, token): return self.parser.phases["in body"].process_start_tag(token) def start_tag_head(self, token): self.parser.parse_error("two-heads-are-not-better-than-one") def start_tag_base_link_command(self, token): self.tree.insert_element(token) self.tree.open_elements.pop() token["selfClosingAcknowledged"] = True def start_tag_meta(self, token): self.tree.insert_element(token) self.tree.open_elements.pop() token["selfClosingAcknowledged"] = True attributes = token["data"] if self.parser.tokenizer.stream.encoding[1] == "tentative": if "charset" in attributes: self.parser.tokenizer.stream.change_encoding(attributes["charset"]) elif ("content" in attributes and "http-equiv" in attributes and attributes["http-equiv"].lower() == "content-type"): # Encoding it as UTF-8 here is a hack, as really we should pass # the abstract Unicode string, and just use the # ContentAttributeParser on that, but using UTF-8 allows all chars # to be encoded and as a ASCII-superset works. data = inputstream.EncodingBytes(attributes["content"].encode("utf-8")) parser = inputstream.ContentAttributeParser(data) codec = parser.parse() self.parser.tokenizer.stream.change_encoding(codec) def start_tag_title(self, token): self.parser.parse_rcdata_rawtext(token, "RCDATA") def start_tag_noframes_style(self, token): # Need to decide whether to implement the scripting-disabled case self.parser.parse_rcdata_rawtext(token, "RAWTEXT") def start_tag_noscript(self, token): if self.parser.scripting: self.parser.parse_rcdata_rawtext(token, "RAWTEXT") else: self.tree.insert_element(token) self.parser.phase = self.parser.phases["in head noscript"] def start_tag_script(self, token): self.tree.insert_element(token) self.parser.tokenizer.state = self.parser.tokenizer.script_data_state self.parser.original_phase = self.parser.phase self.parser.phase = self.parser.phases["text"] def start_tag_other(self, token): self.anything_else() return token def end_tag_head(self, token): node = self.parser.tree.open_elements.pop() assert node.name == "head", "Expected head got %s" % node.name self.parser.phase = self.parser.phases["after head"] def end_tag_html_body_br(self, token): self.anything_else() return token def end_tag_other(self, token): self.parser.parse_error("unexpected-end-tag", {"name": token["name"]}) def anything_else(self): self.end_tag_head(implied_tag_token("head")) start_tag_handler = dispatch([ ("html", start_tag_html), ("title", start_tag_title), (("noframes", "style"), start_tag_noframes_style), ("noscript", start_tag_noscript), ("script", start_tag_script), (("base", "basefont", "bgsound", "command", "link"), start_tag_base_link_command), ("meta", start_tag_meta), ("head", start_tag_head) ]) end_tag_handler = dispatch([ ("head", end_tag_head), (("br", "html", "body"), end_tag_html_body_br) ]) class InHeadNoscriptPhase(Phase): __slots__ = tuple() def process_eof(self): self.parser.parse_error("eof-in-head-noscript") self.anything_else() return True def process_comment(self, token): return self.parser.phases["in head"].process_comment(token) def process_characters(self, token): self.parser.parse_error("char-in-head-noscript") self.anything_else() return token def process_space_characters(self, token): return self.parser.phases["in head"].process_space_characters(token) def start_tag_html(self, token): return self.parser.phases["in body"].process_start_tag(token) def start_tag_base_link_command(self, token): return self.parser.phases["in head"].process_start_tag(token) def start_tag_head_noscript(self, token): self.parser.parse_error("unexpected-start-tag", {"name": token["name"]}) def start_tag_other(self, token): self.parser.parse_error( "unexpected-inhead-noscript-tag", {"name": token["name"]}) self.anything_else() return token def end_tag_noscript(self, token): node = self.parser.tree.open_elements.pop() assert node.name == "noscript", f"Expected noscript got {node.name}" self.parser.phase = self.parser.phases["in head"] def end_tag_br(self, token): self.parser.parse_error( "unexpected-inhead-noscript-tag", {"name": token["name"]}) self.anything_else() return token def end_tag_other(self, token): self.parser.parse_error("unexpected-end-tag", {"name": token["name"]}) def anything_else(self): # Caller must raise parse error first! self.end_tag_noscript(implied_tag_token("noscript")) start_tag_handler = dispatch([ ("html", start_tag_html), (("basefont", "bgsound", "link", "meta", "noframes", "style"), start_tag_base_link_command), (("head", "noscript"), start_tag_head_noscript), ]) end_tag_handler = dispatch([ ("noscript", end_tag_noscript), ("br", end_tag_br), ]) class AfterHeadPhase(Phase): __slots__ = tuple() def process_eof(self): self.anything_else() return True def process_characters(self, token): self.anything_else() return token def start_tag_html(self, token): return self.parser.phases["in body"].process_start_tag(token) def start_tag_body(self, token): self.parser.frameset_ok = False self.tree.insert_element(token) self.parser.phase = self.parser.phases["in body"] def start_tag_frameset(self, token): self.tree.insert_element(token) self.parser.phase = self.parser.phases["in frameset"] def start_tag_from_head(self, token): self.parser.parse_error( "unexpected-start-tag-out-of-my-head", {"name": token["name"]}) self.tree.open_elements.append(self.tree.head_element) self.parser.phases["in head"].process_start_tag(token) for node in self.tree.open_elements[::-1]: if node.name == "head": self.tree.open_elements.remove(node) break def start_tag_head(self, token): self.parser.parse_error("unexpected-start-tag", {"name": token["name"]}) def start_tag_other(self, token): self.anything_else() return token def end_tag_html_body_br(self, token): self.anything_else() return token def end_tag_other(self, token): self.parser.parse_error("unexpected-end-tag", {"name": token["name"]}) def anything_else(self): self.tree.insert_element(implied_tag_token("body", "START_TAG")) self.parser.phase = self.parser.phases["in body"] self.parser.frameset_ok = True start_tag_handler = dispatch([ ("html", start_tag_html), ("body", start_tag_body), ("frameset", start_tag_frameset), (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title"), start_tag_from_head), ("head", start_tag_head) ]) end_tag_handler = dispatch([ (("body", "html", "br"), end_tag_html_body_br) ]) class InBodyPhase(Phase): # https://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody # The really-really-really-very crazy mode. __slots__ = ("process_space_characters",) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Set this to the default handler. self.process_space_characters = self.process_space_characters_non_pre def is_matching_formatting_element(self, node1, node2): return ( node1.name == node2.name and node1.namespace == node2.namespace and node1.attributes == node2.attributes) def add_formatting_element(self, token): self.tree.insert_element(token) element = self.tree.open_elements[-1] matching_elements = [] for node in self.tree.active_formatting_elements[::-1]: if node is Marker: break elif self.is_matching_formatting_element(node, element): matching_elements.append(node) assert len(matching_elements) <= 3 if len(matching_elements) == 3: self.tree.active_formatting_elements.remove(matching_elements[-1]) self.tree.active_formatting_elements.append(element) # The real deal. def process_eof(self): allowed_elements = frozenset(( "dd", "dt", "li", "p", "tbody", "td", "tfoot", "th", "thead", "tr", "body", "html")) for node in self.tree.open_elements[::-1]: if node.name not in allowed_elements: self.parser.parse_error("expected-closing-tag-but-got-eof") break # Stop parsing. def process_space_characters_drop_newline(self, token): # Sometimes (start of
, , and