from . import inputstream
from .constants import (
ReparseError,
Token,
adjust_foreign_attributes,
adjust_mathml_attributes,
adjust_svg_attributes,
ascii_upper_to_lower,
cdata_elements,
heading_elements,
html_integration_point_elements,
mathml_text_integration_point_elements,
namespaces,
rcdata_elements,
space_characters,
special_elements,
)
from .tokenizer import HTMLTokenizer
from .treebuilder import Marker, TreeBuilder
def parse(document, namespace_html_elements=True, **kwargs):
    """Parse an HTML document into a tree.

    :param document:
        The document to parse as a HTML string, filename, file-like object.
    :type document:
        :class:`str`, :class:`bytes`, :class:`pathlib.Path` or
        :term:`file object`
    :param bool namespace_html_elements:
        Whether or not to namespace HTML elements.

    Extra parameters can be provided to define possible encodings if the
    document is given as :class:`bytes`.

    :param override_encoding: Forced encoding provided by user agent.
    :type override_encoding: str or bytes
    :param transport_encoding: Encoding provided by transport layout.
    :type transport_encoding: str or bytes
    :param same_origin_parent_encoding: Parent document encoding.
    :type same_origin_parent_encoding: str or bytes
    :param likely_encoding: Possible encoding provided by user agent.
    :type likely_encoding: str or bytes
    :param default_encoding: Encoding used as fallback.
    :type default_encoding: str or bytes
    :returns: :class:`xml.etree.ElementTree.Element`.

    Example:

    >>> from tinyhtml5 import parse
    >>> root = parse('<html><body><p>Hello world!</p></body></html>')

    """
    return HTMLParser(namespace_html_elements).parse(document, **kwargs)
class HTMLParser:
    """HTML parser.

    Generate a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, namespace_html_elements=True):
        # Tree builder receiving elements as the parser creates them.
        self.tree = TreeBuilder(namespace_html_elements)
        # Parse errors collected as (position, errorcode, datavars) tuples.
        self.errors = []
        # One long-lived helper object per insertion mode ("phase").
        # ``_phases`` is the name -> class mapping defined at module level.
        self.phases = {name: cls(self, self.tree) for name, cls in _phases.items()}

    def _parse(self, stream, container=None, scripting=False, **kwargs):
        """Tokenize ``stream`` and build the tree.

        :param container: fragment container tag name, or ``None`` for a
            full document.
        :param scripting: whether the "scripting enabled" flag is set.
        Extra keyword arguments are forwarded to the tokenizer (encodings).
        """
        self.container = container
        self.scripting = scripting
        self.tokenizer = HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()
        try:
            self.main_loop()
        except ReparseError:
            # A late encoding change invalidates everything parsed so far:
            # reset and parse the re-decoded stream from scratch.
            self.reset()
            self.main_loop()

    def reset(self):
        """Reset all parser state so a (re)parse can start from scratch."""
        self.tree.reset()
        self.first_start_tag = False
        self.errors = []
        self.compatibility_mode = "no quirks"  # or "quirks" or "limited quirks"
        if self.container:
            # Fragment parsing: choose the tokenizer start state implied by
            # the container element, then fake the root <html> element.
            # NOTE(review): the constant names are historical — cdata_elements
            # map to the RCDATA state and rcdata_elements to RAWTEXT, matching
            # the original html5lib constants; confirm against .constants.
            if self.container in cdata_elements:
                self.tokenizer.state = self.tokenizer.rcdata_state
            elif self.container in rcdata_elements:
                self.tokenizer.state = self.tokenizer.rawtext_state
            elif self.container == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintext_state
            else:
                # State already is data state.
                # self.tokenizer.state = self.tokenizer.data_state
                pass
            self.phase = self.phases["before html"]
            self.phase._insert_html_element()
            self.reset_insertion_mode()
        else:
            self.phase = self.phases["initial"]
        self.last_phase = None
        self.before_rcdata_phase = None
        self.frameset_ok = True

    @property
    def encoding(self):
        """Name of the character encoding that was used to decode the input stream.

        :obj:`None` if that is not determined yet.

        """
        # Implicitly returns None before _parse() has created the tokenizer.
        if hasattr(self, 'tokenizer'):
            return self.tokenizer.stream.encoding[0].name

    def is_html_integration_point(self, element):
        # An HTML integration point is a foreign element whose children are
        # parsed as regular HTML (HTML spec tree construction dispatcher).
        full_name = (element.namespace, element.name)
        if full_name == (namespaces["mathml"], "annotation-xml"):
            # <annotation-xml> only qualifies when its encoding attribute
            # declares HTML content (matched case-insensitively).
            return (
                "encoding" in element.attributes and
                element.attributes["encoding"].translate(ascii_upper_to_lower) in
                ("text/html", "application/xhtml+xml"))
        return full_name in html_integration_point_elements

    def is_mathml_text_integration_point(self, element):
        # MathML text integration points also take HTML parsing for most
        # tokens; membership is defined in .constants.
        full_name = (element.namespace, element.name)
        return full_name in mathml_text_integration_point_elements

    def main_loop(self):
        """Run the tree construction dispatcher over the token stream."""
        for token in self.tokenizer:
            previous_token = None
            new_token = token
            # A phase handler may hand the token back for reprocessing
            # (possibly by a different phase); loop until fully consumed.
            while new_token is not None:
                previous_token = new_token
                current_node = (
                    self.tree.open_elements[-1] if self.tree.open_elements else None)
                current_node_namespace = (
                    current_node.namespace if current_node else None)
                current_node_name = current_node.name if current_node else None
                type = new_token["type"]
                if type == Token.PARSE_ERROR:
                    self.parse_error(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # Tree construction dispatcher: process in the current
                    # phase unless the current node demands foreign
                    # (SVG/MathML) content handling.
                    if (len(self.tree.open_elements) == 0 or
                            current_node_namespace == self.tree.default_namespace or
                            (self.is_mathml_text_integration_point(current_node) and
                             ((type == Token.START_TAG and
                               token["name"] not in frozenset(["mglyph", "malignmark"])) or
                              type in (Token.CHARACTERS, Token.SPACE_CHARACTERS))) or
                            (current_node_namespace == namespaces["mathml"] and
                             current_node_name == "annotation-xml" and
                             type == Token.START_TAG and
                             token["name"] == "svg") or
                            (self.is_html_integration_point(current_node) and type in (
                                Token.START_TAG, Token.CHARACTERS,
                                Token.SPACE_CHARACTERS))):
                        phase = self.phase
                    else:
                        phase = self.phases["in foreign content"]
                    if type == Token.CHARACTERS:
                        new_token = phase.process_characters(new_token)
                    elif type == Token.SPACE_CHARACTERS:
                        new_token = phase.process_space_characters(new_token)
                    elif type == Token.START_TAG:
                        new_token = phase.process_start_tag(new_token)
                    elif type == Token.END_TAG:
                        new_token = phase.process_end_tag(new_token)
                    elif type == Token.COMMENT:
                        new_token = phase.process_comment(new_token)
                    elif type == Token.DOCTYPE:
                        new_token = phase.process_doctype(new_token)
                # A self-closing start tag whose flag was never acknowledged
                # by a handler is an error (e.g. <div/>).
                if (type == Token.START_TAG and previous_token["selfClosing"] and
                        not previous_token["selfClosingAcknowledged"]):
                    self.parse_error(
                        "non-void-element-with-trailing-solidus",
                        {"name": previous_token["name"]})
        # When the loop finishes it's EOF.
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.process_eof()
            if reprocess:
                # Guard against an infinite EOF reprocessing loop.
                assert self.phase not in phases

    def parse(self, stream, full_tree=False, **kwargs):
        """Parse a HTML document into a well-formed tree.

        If ``full_tree`` is ``True``, return the whole tree.

        """
        self._parse(stream, **kwargs)
        return self.tree.get_document(full_tree)

    def parse_fragment(self, stream, container="div", **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment.

        ``container`` is the tag name of the fragment’s container.

        """
        self._parse(stream, container=container, **kwargs)
        return self.tree.get_fragment()

    def parse_error(self, errorcode, datavars=None):
        # Errors are recorded, never raised; callers inspect self.errors.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))

    def adjust_mathml_attributes(self, token):
        # Inside the method body, ``adjust_mathml_attributes`` resolves to
        # the imported mapping (the method name is only a class attribute,
        # not in lexical scope). Same for the two methods below.
        adjust_attributes(token, adjust_mathml_attributes)

    def adjust_svg_attributes(self, token):
        adjust_attributes(token, adjust_svg_attributes)

    def adjust_foreign_attributes(self, token):
        adjust_attributes(token, adjust_foreign_attributes)

    def reset_insertion_mode(self):
        """Pick the phase appropriate to the stack of open elements.

        Used when fragment parsing starts and after certain tag handling.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        # Element name -> insertion mode, per the spec's reset algorithm.
        new_modes = {
            "select": "in select",
            "td": "in cell",
            "th": "in cell",
            "tr": "in row",
            "tbody": "in table body",
            "thead": "in table body",
            "tfoot": "in table body",
            "caption": "in caption",
            "colgroup": "in column group",
            "table": "in table",
            "head": "in body",
            "body": "in body",
            "frameset": "in frameset",
            "html": "before head"
        }
        # Walk the stack of open elements from the innermost outwards.
        for node in self.tree.open_elements[::-1]:
            node_name = node.name
            new_phase = None
            if node == self.tree.open_elements[0]:
                # Outermost node: substitute the fragment container's name.
                assert self.container
                last = True
                node_name = self.container
            # Check for conditions that should only happen in the fragment case.
            if node_name in ("select", "colgroup", "head", "html"):
                assert self.container
            if not last and node.namespace != self.tree.default_namespace:
                # Foreign (SVG/MathML) elements never determine the mode.
                continue
            if node_name in new_modes:
                new_phase = self.phases[new_modes[node_name]]
                break
            elif last:
                new_phase = self.phases["in body"]
                break
        self.phase = new_phase

    def parse_rcdata_rawtext(self, token, content_type):
        # Generic RCDATA/RAWTEXT Parsing algorithm.
        assert content_type in ("RAWTEXT", "RCDATA")
        self.tree.insert_element(token)
        if content_type == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtext_state
        else:
            self.tokenizer.state = self.tokenizer.rcdata_state
        # Remember where to come back to once the text element ends.
        self.original_phase = self.phase
        self.phase = self.phases["text"]
def dispatch(items):
    """Build a tag-name-to-handler table from ``(keys, handler)`` pairs.

    Each ``keys`` entry is either a single tag name or an iterable of tag
    names; every individual name is mapped to its handler. Later entries
    win on duplicate names.
    """
    table = {}
    for keys, handler in items:
        names = (keys,) if isinstance(keys, str) else keys
        for name in names:
            table[name] = handler
    return table
class Phase:
    """Base class for helper that implements each phase of processing.

    Subclasses provide ``start_tag_handler`` / ``end_tag_handler`` dicts
    (built with :func:`dispatch`) plus ``start_tag_other`` /
    ``end_tag_other`` fallbacks, and override the ``process_*`` methods
    where the defaults below are not correct for their insertion mode.
    """

    # The two cache names are mangled to _Phase__*, matching their use in
    # the methods below.
    __slots__ = ("parser", "tree", "__start_tag_cache", "__end_tag_cache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # Per-instance memoization of tag name -> handler lookups; see
        # process_start_tag / process_end_tag.
        self.__start_tag_cache = {}
        self.__end_tag_cache = {}

    def process_eof(self):  # pragma: no cover
        raise NotImplementedError

    def process_comment(self, token):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        self.tree.insert_comment(token, self.tree.open_elements[-1])

    def process_doctype(self, token):
        # A doctype anywhere but the "initial" phase is an error.
        self.parser.parse_error("unexpected-doctype")

    def process_characters(self, token):
        self.tree.insert_text(token["data"])

    def process_space_characters(self, token):
        self.tree.insert_text(token["data"])

    def process_start_tag(self, token):
        """Dispatch a start tag token to the handler for its tag name.

        Unknown names fall back to the subclass's ``start_tag_other``.
        """
        name = token["name"]
        # In Py3, `in` is quicker when there are few cache hits (typically
        # short inputs).
        if name in self.__start_tag_cache:
            function = self.__start_tag_cache[name]
        else:
            function = self.__start_tag_cache[name] = self.start_tag_handler.get(
                name, type(self).start_tag_other)
            # Bound the cache size in case we get loads of unknown tags.
            while len(self.__start_tag_cache) > len(self.start_tag_handler) * 1.1:
                # This makes the eviction policy random on Py < 3.7 and FIFO >= 3.7.
                self.__start_tag_cache.pop(next(iter(self.__start_tag_cache)))
        # Handlers are stored as plain functions, so pass self explicitly.
        return function(self, token)

    def start_tag_html(self, token):
        if not self.parser.first_start_tag and token["name"] == "html":
            self.parser.parse_error("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parse_error().
        # Merge attributes of a repeated <html> tag into the root element
        # without overwriting the ones already present.
        for attr, value in token["data"].items():
            if attr not in self.tree.open_elements[0].attributes:
                self.tree.open_elements[0].attributes[attr] = value
        self.parser.first_start_tag = False

    def process_end_tag(self, token):
        """Dispatch an end tag token, mirroring ``process_start_tag``."""
        name = token["name"]
        # In Py3, `in` is quicker when there are few cache hits (typically
        # short inputs).
        if name in self.__end_tag_cache:
            function = self.__end_tag_cache[name]
        else:
            function = self.__end_tag_cache[name] = self.end_tag_handler.get(
                name, type(self).end_tag_other)
            # Bound the cache size in case we get loads of unknown tags.
            while len(self.__end_tag_cache) > len(self.end_tag_handler) * 1.1:
                # This makes the eviction policy random on Py < 3.7 and FIFO >= 3.7.
                self.__end_tag_cache.pop(next(iter(self.__end_tag_cache)))
        return function(self, token)
class InitialPhase(Phase):
    """"initial" insertion mode: before a doctype has been seen."""

    __slots__ = tuple()

    def process_space_characters(self, token):
        # Whitespace before the doctype is ignored.
        pass

    def process_comment(self, token):
        # Comments before the doctype attach to the document node itself.
        self.tree.insert_comment(token, self.tree.document)

    def process_doctype(self, token):
        """Insert the doctype and derive the document compatibility mode."""
        name = token["name"]
        public_id = token["publicId"]
        system_id = token["systemId"]
        correct = token["correct"]
        # Anything other than plain `<!DOCTYPE html>` (optionally with the
        # legacy-compat system id) is reported — but still inserted.
        if (name != "html" or public_id is not None or
                system_id is not None and system_id != "about:legacy-compat"):
            self.parser.parse_error("unknown-doctype")
        if public_id is None:
            public_id = ""
        self.tree.insert_doctype(token)
        if public_id != "":
            # Public identifiers are matched case-insensitively.
            public_id = public_id.translate(ascii_upper_to_lower)
        # Legacy public/system identifiers that trigger full "quirks" mode,
        # per the spec's "the initial insertion mode" rules.
        if (not correct or token["name"] != "html" or
                public_id.startswith(
                    ("+//silmaril//dtd html pro v0r11 19970101//",
                     "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
                     "-//as//dtd html 3.0 aswedit + extensions//",
                     "-//ietf//dtd html 2.0 level 1//",
                     "-//ietf//dtd html 2.0 level 2//",
                     "-//ietf//dtd html 2.0 strict level 1//",
                     "-//ietf//dtd html 2.0 strict level 2//",
                     "-//ietf//dtd html 2.0 strict//",
                     "-//ietf//dtd html 2.0//",
                     "-//ietf//dtd html 2.1e//",
                     "-//ietf//dtd html 3.0//",
                     "-//ietf//dtd html 3.2 final//",
                     "-//ietf//dtd html 3.2//",
                     "-//ietf//dtd html 3//",
                     "-//ietf//dtd html level 0//",
                     "-//ietf//dtd html level 1//",
                     "-//ietf//dtd html level 2//",
                     "-//ietf//dtd html level 3//",
                     "-//ietf//dtd html strict level 0//",
                     "-//ietf//dtd html strict level 1//",
                     "-//ietf//dtd html strict level 2//",
                     "-//ietf//dtd html strict level 3//",
                     "-//ietf//dtd html strict//",
                     "-//ietf//dtd html//",
                     "-//metrius//dtd metrius presentational//",
                     "-//microsoft//dtd internet explorer 2.0 html strict//",
                     "-//microsoft//dtd internet explorer 2.0 html//",
                     "-//microsoft//dtd internet explorer 2.0 tables//",
                     "-//microsoft//dtd internet explorer 3.0 html strict//",
                     "-//microsoft//dtd internet explorer 3.0 html//",
                     "-//microsoft//dtd internet explorer 3.0 tables//",
                     "-//netscape comm. corp.//dtd html//",
                     "-//netscape comm. corp.//dtd strict html//",
                     "-//o'reilly and associates//dtd html 2.0//",
                     "-//o'reilly and associates//dtd html extended 1.0//",
                     "-//o'reilly and associates//dtd html extended relaxed 1.0//",
                     "-//softquad software//dtd hotmetal pro 6.0::19990601::"
                     "extensions to html 4.0//",
                     "-//softquad//dtd hotmetal pro 4.0::19971010::"
                     "extensions to html 4.0//",
                     "-//spyglass//dtd html 2.0 extended//",
                     "-//sq//dtd html 2.0 hotmetal + extensions//",
                     "-//sun microsystems corp.//dtd hotjava html//",
                     "-//sun microsystems corp.//dtd hotjava strict html//",
                     "-//w3c//dtd html 3 1995-03-24//",
                     "-//w3c//dtd html 3.2 draft//",
                     "-//w3c//dtd html 3.2 final//",
                     "-//w3c//dtd html 3.2//",
                     "-//w3c//dtd html 3.2s draft//",
                     "-//w3c//dtd html 4.0 frameset//",
                     "-//w3c//dtd html 4.0 transitional//",
                     "-//w3c//dtd html experimental 19960712//",
                     "-//w3c//dtd html experimental 970421//",
                     "-//w3c//dtd w3 html//",
                     "-//w3o//dtd w3 html 3.0//",
                     "-//webtechs//dtd mozilla html 2.0//",
                     "-//webtechs//dtd mozilla html//")) or
                public_id in ("-//w3o//dtd w3 html strict 3.0//en//",
                              "-/w3c/dtd html 4.0 transitional/en",
                              "html") or
                # HTML 4.01 frameset/transitional: quirks only when the
                # system identifier is absent (limited quirks otherwise).
                public_id.startswith(
                    ("-//w3c//dtd html 4.01 frameset//",
                     "-//w3c//dtd html 4.01 transitional//")) and
                system_id is None or
                system_id and system_id.lower() ==
                "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
            self.parser.compatibility_mode = "quirks"
        elif (public_id.startswith(
                ("-//w3c//dtd xhtml 1.0 frameset//",
                 "-//w3c//dtd xhtml 1.0 transitional//")) or
                public_id.startswith(
                    ("-//w3c//dtd html 4.01 frameset//",
                     "-//w3c//dtd html 4.01 transitional//")) and
                system_id is not None):
            self.parser.compatibility_mode = "limited quirks"
        self.parser.phase = self.parser.phases["before html"]

    def anything_else(self):
        # A missing doctype forces quirks mode.
        self.parser.compatibility_mode = "quirks"
        self.parser.phase = self.parser.phases["before html"]

    def process_characters(self, token):
        self.parser.parse_error("expected-doctype-but-got-chars")
        self.anything_else()
        # Reprocess the token in the "before html" phase.
        return token

    def process_start_tag(self, token):
        self.parser.parse_error(
            "expected-doctype-but-got-start-tag", {"name": token["name"]})
        self.anything_else()
        return token

    def process_end_tag(self, token):
        self.parser.parse_error(
            "expected-doctype-but-got-end-tag", {"name": token["name"]})
        self.anything_else()
        return token

    def process_eof(self):
        self.parser.parse_error("expected-doctype-but-got-eof")
        self.anything_else()
        # True means: reprocess EOF in the new phase.
        return True
class BeforeHtmlPhase(Phase):
    """"before html" insertion mode: create the root ``<html>`` element."""

    __slots__ = ()

    def _insert_html_element(self):
        # Create the implied root element, then move on to "before head".
        self.tree.insert_root(implied_tag_token("html", "START_TAG"))
        self.parser.phase = self.parser.phases["before head"]

    def process_eof(self):
        self._insert_html_element()
        # Reprocess EOF in the new phase.
        return True

    def process_comment(self, token):
        # Comments at this point belong to the document node itself.
        self.tree.insert_comment(token, self.tree.document)

    def process_space_characters(self, token):
        # Whitespace before the root element is dropped.
        pass

    def process_characters(self, token):
        self._insert_html_element()
        return token

    def process_start_tag(self, token):
        if token["name"] == "html":
            # An explicit <html> counts as the document's first start tag.
            self.parser.first_start_tag = True
        self._insert_html_element()
        return token

    def process_end_tag(self, token):
        name = token["name"]
        if name in {"head", "body", "html", "br"}:
            # These end tags imply the root element and are then reprocessed.
            self._insert_html_element()
            return token
        # Any other end tag before <html> is an error and is dropped.
        self.parser.parse_error(
            "unexpected-end-tag-before-html", {"name": name})
class BeforeHeadPhase(Phase):
    """"before head" insertion mode: root seen, waiting for <head>."""

    __slots__ = tuple()

    def process_eof(self):
        # Synthesize the implied <head>, then reprocess EOF in "in head".
        self.start_tag_head(implied_tag_token("head", "START_TAG"))
        return True

    def process_space_characters(self, token):
        # Whitespace before <head> is dropped.
        pass

    def process_characters(self, token):
        # Text implies <head>; the token is then reprocessed.
        self.start_tag_head(implied_tag_token("head", "START_TAG"))
        return token

    def start_tag_html(self, token):
        return self.parser.phases["in body"].process_start_tag(token)

    def start_tag_head(self, token):
        self.tree.insert_element(token)
        # Remember the head element; "after head" may need to reopen it.
        self.tree.head_element = self.tree.open_elements[-1]
        self.parser.phase = self.parser.phases["in head"]

    def start_tag_other(self, token):
        # Any other start tag implies <head> first, then is reprocessed.
        self.start_tag_head(implied_tag_token("head", "START_TAG"))
        return token

    def end_tag_imply_head(self, token):
        self.start_tag_head(implied_tag_token("head", "START_TAG"))
        return token

    def end_tag_other(self, token):
        self.parser.parse_error("end-tag-after-implied-root", {"name": token["name"]})

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        ("head", start_tag_head)
    ])
    end_tag_handler = dispatch([
        (("head", "body", "html", "br"), end_tag_imply_head)
    ])
class InHeadPhase(Phase):
    """"in head" insertion mode: inside the <head> element."""

    __slots__ = tuple()

    def process_eof(self):
        # Close the head implicitly, then reprocess EOF.
        self.anything_else()
        return True

    def process_characters(self, token):
        # Non-space text ends the head; the token is reprocessed.
        self.anything_else()
        return token

    def start_tag_html(self, token):
        return self.parser.phases["in body"].process_start_tag(token)

    def start_tag_head(self, token):
        self.parser.parse_error("two-heads-are-not-better-than-one")

    def start_tag_base_link_command(self, token):
        # Void elements: insert, then pop immediately.
        self.tree.insert_element(token)
        self.tree.open_elements.pop()
        token["selfClosingAcknowledged"] = True

    def start_tag_meta(self, token):
        self.tree.insert_element(token)
        self.tree.open_elements.pop()
        token["selfClosingAcknowledged"] = True
        attributes = token["data"]
        # A <meta> may declare the document encoding; only honor it while
        # the stream's current encoding is still tentative.
        if self.parser.tokenizer.stream.encoding[1] == "tentative":
            if "charset" in attributes:
                self.parser.tokenizer.stream.change_encoding(attributes["charset"])
            elif ("content" in attributes and
                  "http-equiv" in attributes and
                  attributes["http-equiv"].lower() == "content-type"):
                # Encoding it as UTF-8 here is a hack, as really we should pass
                # the abstract Unicode string, and just use the
                # ContentAttributeParser on that, but using UTF-8 allows all chars
                # to be encoded and as a ASCII-superset works.
                data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
                parser = inputstream.ContentAttributeParser(data)
                codec = parser.parse()
                self.parser.tokenizer.stream.change_encoding(codec)

    def start_tag_title(self, token):
        # <title> content is RCDATA (character references are honored).
        self.parser.parse_rcdata_rawtext(token, "RCDATA")

    def start_tag_noframes_style(self, token):
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parse_rcdata_rawtext(token, "RAWTEXT")

    def start_tag_noscript(self, token):
        # With scripting on, <noscript> content is raw text; otherwise it is
        # parsed in its own dedicated phase.
        if self.parser.scripting:
            self.parser.parse_rcdata_rawtext(token, "RAWTEXT")
        else:
            self.tree.insert_element(token)
            self.parser.phase = self.parser.phases["in head noscript"]

    def start_tag_script(self, token):
        self.tree.insert_element(token)
        self.parser.tokenizer.state = self.parser.tokenizer.script_data_state
        self.parser.original_phase = self.parser.phase
        self.parser.phase = self.parser.phases["text"]

    def start_tag_other(self, token):
        self.anything_else()
        return token

    def end_tag_head(self, token):
        node = self.parser.tree.open_elements.pop()
        assert node.name == "head", "Expected head got %s" % node.name
        self.parser.phase = self.parser.phases["after head"]

    def end_tag_html_body_br(self, token):
        # These end tags close the head implicitly and are reprocessed.
        self.anything_else()
        return token

    def end_tag_other(self, token):
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def anything_else(self):
        # Act as if </head> had been seen.
        self.end_tag_head(implied_tag_token("head"))

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        ("title", start_tag_title),
        (("noframes", "style"), start_tag_noframes_style),
        ("noscript", start_tag_noscript),
        ("script", start_tag_script),
        (("base", "basefont", "bgsound", "command", "link"),
         start_tag_base_link_command),
        ("meta", start_tag_meta),
        ("head", start_tag_head)
    ])
    end_tag_handler = dispatch([
        ("head", end_tag_head),
        (("br", "html", "body"), end_tag_html_body_br)
    ])
class InHeadNoscriptPhase(Phase):
    """"in head noscript" insertion mode: <noscript> with scripting off."""

    __slots__ = tuple()

    def process_eof(self):
        self.parser.parse_error("eof-in-head-noscript")
        self.anything_else()
        return True

    def process_comment(self, token):
        # Comments are handled exactly as in the "in head" phase.
        return self.parser.phases["in head"].process_comment(token)

    def process_characters(self, token):
        # Non-space text closes the <noscript>; token is reprocessed.
        self.parser.parse_error("char-in-head-noscript")
        self.anything_else()
        return token

    def process_space_characters(self, token):
        return self.parser.phases["in head"].process_space_characters(token)

    def start_tag_html(self, token):
        return self.parser.phases["in body"].process_start_tag(token)

    def start_tag_base_link_command(self, token):
        # These head-only elements are delegated to the "in head" phase.
        return self.parser.phases["in head"].process_start_tag(token)

    def start_tag_head_noscript(self, token):
        self.parser.parse_error("unexpected-start-tag", {"name": token["name"]})

    def start_tag_other(self, token):
        self.parser.parse_error(
            "unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anything_else()
        return token

    def end_tag_noscript(self, token):
        node = self.parser.tree.open_elements.pop()
        assert node.name == "noscript", f"Expected noscript got {node.name}"
        self.parser.phase = self.parser.phases["in head"]

    def end_tag_br(self, token):
        self.parser.parse_error(
            "unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anything_else()
        return token

    def end_tag_other(self, token):
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def anything_else(self):
        # Caller must raise parse error first!
        self.end_tag_noscript(implied_tag_token("noscript"))

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"),
         start_tag_base_link_command),
        (("head", "noscript"), start_tag_head_noscript),
    ])
    end_tag_handler = dispatch([
        ("noscript", end_tag_noscript),
        ("br", end_tag_br),
    ])
class AfterHeadPhase(Phase):
    """"after head" insertion mode: </head> seen, waiting for <body>."""

    __slots__ = tuple()

    def process_eof(self):
        # Synthesize <body>, then reprocess EOF in "in body".
        self.anything_else()
        return True

    def process_characters(self, token):
        self.anything_else()
        return token

    def start_tag_html(self, token):
        return self.parser.phases["in body"].process_start_tag(token)

    def start_tag_body(self, token):
        self.parser.frameset_ok = False
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in body"]

    def start_tag_frameset(self, token):
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in frameset"]

    def start_tag_from_head(self, token):
        # Head-only elements appearing after </head>: reopen the head,
        # process the tag there, then remove the head from the stack again.
        self.parser.parse_error(
            "unexpected-start-tag-out-of-my-head", {"name": token["name"]})
        self.tree.open_elements.append(self.tree.head_element)
        self.parser.phases["in head"].process_start_tag(token)
        for node in self.tree.open_elements[::-1]:
            if node.name == "head":
                self.tree.open_elements.remove(node)
                break

    def start_tag_head(self, token):
        self.parser.parse_error("unexpected-start-tag", {"name": token["name"]})

    def start_tag_other(self, token):
        self.anything_else()
        return token

    def end_tag_html_body_br(self, token):
        self.anything_else()
        return token

    def end_tag_other(self, token):
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def anything_else(self):
        # Anything else implies an open <body>.
        self.tree.insert_element(implied_tag_token("body", "START_TAG"))
        self.parser.phase = self.parser.phases["in body"]
        self.parser.frameset_ok = True

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        ("body", start_tag_body),
        ("frameset", start_tag_frameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"), start_tag_from_head),
        ("head", start_tag_head)
    ])
    end_tag_handler = dispatch([
        (("body", "html", "br"), end_tag_html_body_br)
    ])
class InBodyPhase(Phase):
# https://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# The really-really-really-very crazy mode.
__slots__ = ("process_space_characters",)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Set this to the default handler.
self.process_space_characters = self.process_space_characters_non_pre
def is_matching_formatting_element(self, node1, node2):
return (
node1.name == node2.name and
node1.namespace == node2.namespace and
node1.attributes == node2.attributes)
def add_formatting_element(self, token):
self.tree.insert_element(token)
element = self.tree.open_elements[-1]
matching_elements = []
for node in self.tree.active_formatting_elements[::-1]:
if node is Marker:
break
elif self.is_matching_formatting_element(node, element):
matching_elements.append(node)
assert len(matching_elements) <= 3
if len(matching_elements) == 3:
self.tree.active_formatting_elements.remove(matching_elements[-1])
self.tree.active_formatting_elements.append(element)
# The real deal.
def process_eof(self):
allowed_elements = frozenset((
"dd", "dt", "li", "p",
"tbody", "td", "tfoot", "th", "thead", "tr",
"body", "html"))
for node in self.tree.open_elements[::-1]:
if node.name not in allowed_elements:
self.parser.parse_error("expected-closing-tag-but-got-eof")
break
# Stop parsing.
def process_space_characters_drop_newline(self, token):
# Sometimes (start of , , and