tractatus/.venv-docs/lib/python3.12/site-packages/tinyhtml5/treebuilder.py
TheFlow ac2db33732 fix(submissions): restructure Economist package and fix article display
- Create Economist SubmissionTracking package correctly:
  * mainArticle = full blog post content
  * coverLetter = 216-word SIR— letter
  * Links to blog post via blogPostId
- Archive 'Letter to The Economist' from blog posts (it's the cover letter)
- Fix date display on article cards (use published_at)
- Target publication already displaying via blue badge

Database changes:
- Make blogPostId optional in SubmissionTracking model
- Economist package ID: 68fa85ae49d4900e7f2ecd83
- Le Monde package ID: 68fa2abd2e6acd5691932150

Next: Enhanced modal with tabs, validation, export

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-24 08:47:42 +13:00

449 lines
16 KiB
Python

"""Tree builder."""
from copy import copy
from xml.etree import ElementTree
from .constants import namespaces, scoping_elements, table_insert_mode_elements
# Scope marker stored in the list of active formatting elements. Markers are
# inserted when entering object elements, marquees, table cells, and table
# captions, and are used to prevent formatting from "leaking" into tables,
# object elements, and marquees.
Marker = None

_html = namespaces["html"]

# Scope variant -> (set of (namespace, name) tuples, invert flag), looked up
# by TreeBuilder.element_in_scope. While walking the open elements from the
# most recent one, the search is abandoned at the first node for which
# ``invert ^ (node.name_tuple in set)`` is true.
_list_elements = {
    None: (frozenset(scoping_elements), False),
    "button": (
        frozenset(scoping_elements | {(_html, "button")}),
        False,
    ),
    "list": (
        frozenset(scoping_elements | {(_html, "ol"), (_html, "ul")}),
        False,
    ),
    "table": (
        frozenset({(_html, "html"), (_html, "table")}),
        False,
    ),
    "select": (
        frozenset({(_html, "optgroup"), (_html, "option")}),
        True,
    ),
}

# Names of the elements popped by TreeBuilder.generate_implied_end_tags.
# XXX td, th and tr are not actually needed.
_implied_end_tags = frozenset(
    {"dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"})
class ActiveFormattingElements(list):
    """List of active formatting elements that limits duplicate entries."""

    def append(self, node):
        """Append node to the end of the list.

        Before appending a node that is not a Marker, the entries after the
        last Marker are scanned from the end. Entries with the same
        ``name_tuple`` and ``attributes`` as node are counted; when a third
        one is found it is removed from the list, so that at most three equal
        entries remain once node has been added.
        """
        if node is not Marker:
            matches = 0
            for candidate in reversed(self):
                if candidate is Marker:
                    # Entries before the last marker are not considered.
                    break
                if (candidate.name_tuple == node.name_tuple and
                        candidate.attributes == node.attributes):
                    matches += 1
                    if matches == 3:
                        # Safe while iterating: the loop stops right away.
                        self.remove(candidate)
                        break
        super().append(node)
class Element:
    """Tree node wrapping an ``xml.etree.ElementTree`` element.

    The wrapped element is stored in ``_element``. Wrapper nodes of the
    children are stored in ``_children``, in the same order as the children
    of ``_element``. Text is not stored as nodes: it lives in the ``text``
    and ``tail`` attributes of the wrapped elements.
    """

    def __init__(self, name, namespace=None):
        """Create an element called name, optionally in namespace."""
        self.name = name
        self.namespace = namespace
        self._element = ElementTree.Element(self._get_etree_tag(name, namespace))
        # Elements without namespace are identified as HTML elements.
        if namespace is None:
            self.name_tuple = _html, self.name
        else:
            self.name_tuple = self.namespace, self.name
        self._children = []
        # The parent of the current node (or None for the document node).
        self.parent = None

    def _get_etree_tag(self, name, namespace):
        """Return the ElementTree tag: ``name`` or ``{namespace}name``."""
        return name if namespace is None else f"{{{namespace}}}{name}"

    def _get_attributes(self):
        return self._element.attrib

    def _set_attributes(self, attributes):
        """Replace all attributes of the wrapped element.

        Keys of attributes are either strings, used as is, or tuples whose
        items 1 and 2 are the attribute name and namespace, stored as
        ``{namespace}name``.
        """
        element_attributes = self._element.attrib
        element_attributes.clear()
        if attributes:
            # Calling .items _always_ allocates, and the above truthy check is
            # cheaper than the allocation on average.
            for key, value in attributes.items():
                name = f"{{{key[2]}}}{key[1]}" if isinstance(key, tuple) else key
                element_attributes[name] = value

    # A dict holding name -> value pairs for attributes of the node.
    attributes = property(_get_attributes, _set_attributes)

    def _get_children(self):
        return self._children

    def _set_children(self, value):
        """Replace all the children of the current node by nodes of value."""
        del self._element[:]
        self._children = []
        for element in value:
            # append_child keeps _children, _element and parent consistent.
            self.append_child(element)

    # A list of child nodes of the current node. This must include all
    # elements but not necessarily other node types.
    children = property(_get_children, _set_children)

    def has_content(self):
        """Return True if the node has children or text, False otherwise."""
        return bool(self._element.text or len(self._element))

    def append_child(self, node):
        """Insert node as the last child of the current node."""
        self._children.append(node)
        self._element.append(node._element)
        node.parent = self

    def insert_before(self, node, reference):
        """Insert node as a child of the current node, before reference.

        Raise ValueError if reference is not a child of the current node.
        """
        index = list(self._element).index(reference._element)
        self._element.insert(index, node._element)
        # Keep the wrapper list in sync with the wrapped element, otherwise
        # node would be missing from ``children``.
        self._children.insert(index, node)
        node.parent = self

    def remove_child(self, node):
        """Remove node from the children of the current node.

        Raise ValueError if node is not a child of the current node.
        """
        self._children.remove(node)
        self._element.remove(node._element)
        node.parent = None

    def insert_text(self, text, insert_before=None):
        """Insert data as text in the current node.

        If the node has no child, the text is added to the node's own text
        and insert_before is ignored. Otherwise, if insert_before is None,
        the text is added after the last child; if insert_before is a node,
        the text is added right before it. Raise ValueError if insert_before
        is not a child of the current node.
        """
        etree_children = list(self._element)
        if not etree_children:
            self._element.text = (self._element.text or "") + text
            return

        if insert_before is None:
            # Insert the text as the tail of the last child element.
            previous = etree_children[-1]
        else:
            index = etree_children.index(insert_before._element)
            if index == 0:
                # Nothing precedes the reference node: the text belongs to
                # the current node itself.
                self._element.text = (self._element.text or "") + text
                return
            # Insert the text as the tail of the preceding sibling.
            previous = etree_children[index - 1]
        previous.tail = (previous.tail or "") + text

    def clone(self):
        """Return a shallow copy of the current node.

        The node has the same name and attributes, but no parent or children.
        """
        element = type(self)(self.name, self.namespace)
        if self._element.attrib:
            element._element.attrib = copy(self._element.attrib)
        return element

    def reparent_children(self, parent):
        """Move all the children of the current node to parent.

        The text of the current node is moved too: it is added after the last
        child of parent, or to the text of parent when it has no child. This
        is needed so that trees that don't store text as nodes move the text
        in the correct way.
        """
        text = self._element.text
        if parent.children:
            if text:
                # The tail of an element is None until some text is stored.
                last = parent.children[-1]._element
                last.tail = (last.tail or "") + text
        else:
            parent._element.text = (parent._element.text or "") + (text or "")
        self._element.text = ""
        for child in self.children:
            parent.append_child(child)
        self.children = []
class Comment(Element):
    """Comment node wrapping an ``ElementTree.Comment``."""

    def __init__(self, data):
        """Create a comment whose content is data."""
        # Element.__init__ is not called here: only the child list, the
        # parent and the wrapped element are set, so comments have no name,
        # namespace or name_tuple attributes.
        self._children = []
        self.parent = None
        self._element = ElementTree.Comment(data)
class DocumentType(Element):
    """Doctype node, stored as an element named ``<!DOCTYPE>``."""

    def __init__(self, name, public_id, system_id):
        """Create a doctype from its name and its public and system IDs.

        The doctype name is stored as the text of the wrapped element, and
        the identifiers as its ``publicId`` and ``systemId`` attributes.
        """
        super().__init__("<!DOCTYPE>")
        self.public_id = public_id
        self.system_id = system_id
        wrapped = self._element
        wrapped.text = name
        wrapped.set("publicId", public_id)
        wrapped.set("systemId", system_id)
class Document(Element):
    """Root node of a document, stored as a ``DOCUMENT_ROOT`` element."""

    def __init__(self):
        """Create an empty document node."""
        super().__init__("DOCUMENT_ROOT")
class DocumentFragment(Element):
    """Root node of a fragment, stored as a ``DOCUMENT_FRAGMENT`` element."""

    def __init__(self):
        """Create an empty fragment node."""
        super().__init__("DOCUMENT_FRAGMENT")
class TreeBuilder:
    """Tree builder.

    Keeps the state used while building a tree: the stack of open elements,
    the list of active formatting elements, the head and form element
    pointers, and the document node that receives the nodes.
    """

    def __init__(self, namespace_html_elements):
        """Create a TreeBuilder.

        If namespace_html_elements is True, namespace HTML elements.
        """
        if namespace_html_elements:
            self.default_namespace = "http://www.w3.org/1999/xhtml"
        else:
            self.default_namespace = None
        self.reset()

    def reset(self):
        """Drop the current state and start again with an empty document."""
        self.open_elements = []
        self.active_formatting_elements = ActiveFormattingElements()
        self.head_element = None
        self.form_element = None
        # Also selects the function bound to self.insert_element.
        self.insert_from_table = False
        self.document = Document()

    def element_in_scope(self, target, variant=None):
        """Return whether target is in the scope given by variant.

        target is either a node, matched by equality, or a name, matched
        against the name_tuple of the open elements; a string is taken as
        the name of an HTML element. variant is a key of _list_elements.
        """
        # If we pass a node in we match that. If we pass a string
        # match any node with that name.
        exact_node = hasattr(target, "name_tuple")
        if not exact_node:
            if isinstance(target, str):
                target = (_html, target)
            assert isinstance(target, tuple)

        list_elements, invert = _list_elements[variant]

        for node in reversed(self.open_elements):
            if exact_node and node == target:
                return True
            elif not exact_node and node.name_tuple == target:
                return True
            elif (invert ^ (node.name_tuple in list_elements)):
                # Scope boundary reached before finding the target.
                return False

        # We should never reach this point.
        assert False  # pragma: no cover

    def reconstruct_active_formatting_elements(self):
        """Reopen the trailing active formatting elements that are closed.

        The entries that follow the last marker or the last entry still in
        the stack of open elements are cloned, inserted in the tree, and
        replaced in the list by their inserted clone.
        """
        # Within this algorithm the order of steps described in the
        # specification is not quite the same as the order of steps in the
        # code. It should still do the same though.

        # Step 1: stop the algorithm when there's nothing to do.
        if not self.active_formatting_elements:
            return

        # Step 2 and step 3: we start with the last element, so i is the
        # index of the last entry.
        i = len(self.active_formatting_elements) - 1
        entry = self.active_formatting_elements[i]
        if entry is Marker or entry in self.open_elements:
            return

        # Step 6.
        while entry is not Marker and entry not in self.open_elements:
            if i == 0:
                # This will be reset to 0 below.
                i = -1
                break
            i -= 1
            # Step 5: let entry be one earlier in the list.
            entry = self.active_formatting_elements[i]

        while True:
            # Step 7.
            i += 1

            # Step 8.
            entry = self.active_formatting_elements[i]
            clone = entry.clone()  # mainly to get a new copy of the attributes

            # Step 9.
            element = self.insert_element({
                "type": "StartTag",
                "name": clone.name,
                "namespace": clone.namespace,
                "data": clone.attributes,
            })

            # Step 10.
            self.active_formatting_elements[i] = element

            # Step 11.
            if element == self.active_formatting_elements[-1]:
                break

    def clear_active_formatting_elements(self):
        """Pop active formatting elements up to and including the last marker.

        The whole list is emptied if it includes no marker. Nothing happens
        if the list is already empty.
        """
        formatting_elements = self.active_formatting_elements
        while formatting_elements:
            if formatting_elements.pop() is Marker:
                break

    def element_in_active_formatting_elements(self, name):
        """Find name between end of active formatting elements and last marker.

        If an element with this name exists, return it. Else return False.
        """
        for item in reversed(self.active_formatting_elements):
            # Check for Marker first because if it's a Marker it doesn't have a
            # name attribute.
            if item is Marker:
                break
            elif item.name == name:
                return item
        return False

    def insert_root(self, token):
        """Create the element of token as first open element of the document."""
        element = self.create_element(token)
        self.open_elements.append(element)
        self.document.append_child(element)

    def insert_doctype(self, token):
        """Append the doctype described by token to the document."""
        name = token["name"]
        public_id = token["publicId"]
        system_id = token["systemId"]
        doctype = DocumentType(name, public_id, system_id)
        self.document.append_child(doctype)

    def insert_comment(self, token, parent):
        """Append a comment holding the data of token to parent."""
        parent.append_child(Comment(token["data"]))

    def create_element(self, token):
        """Create an element but don't insert it anywhere.

        The namespace of the element is the "namespace" item of token if
        any, the default namespace of the tree builder otherwise.
        """
        name = token["name"]
        namespace = token.get("namespace", self.default_namespace)
        element = Element(name, namespace)
        element.attributes = token["data"]
        return element

    def _get_insert_from_table(self):
        return self._insert_from_table

    def _set_insert_from_table(self, value):
        """Switch the function used to insert an element."""
        self._insert_from_table = value
        if value:
            self.insert_element = self.insert_element_table
        else:
            self.insert_element = self.insert_element_normal

    # Whether elements and text may have to be inserted outside of the
    # current table (see insert_element_table and insert_text).
    insert_from_table = property(_get_insert_from_table, _set_insert_from_table)

    def insert_element_normal(self, token):
        """Create an element, append it to the current node and open it."""
        name = token["name"]
        assert isinstance(name, str), f"Element {name} not unicode"
        element = self.create_element(token)
        self.open_elements[-1].append_child(element)
        self.open_elements.append(element)
        return element

    def insert_element_table(self, token):
        """Create an element and insert it into the tree.

        If the current node is not in table_insert_mode_elements, the element
        is inserted as usual. Otherwise, it is inserted at the position
        given by get_table_misnested_node_position.
        """
        if self.open_elements[-1].name not in table_insert_mode_elements:
            return self.insert_element_normal(token)

        # We should be in the InTable mode. This means we want to do
        # special magic element rearranging.
        element = self.create_element(token)
        parent, insert_before = self.get_table_misnested_node_position()
        if insert_before is None:
            parent.append_child(element)
        else:
            parent.insert_before(element, insert_before)
        self.open_elements.append(element)
        return element

    def insert_text(self, data, parent=None):
        """Insert text data.

        The text is inserted in parent, or in the current node if parent is
        None. When insert_from_table is set and the current node is in
        table_insert_mode_elements, the text is inserted at the position
        given by get_table_misnested_node_position instead.
        """
        if parent is None:
            parent = self.open_elements[-1]

        in_table = (
            self.insert_from_table and
            self.open_elements[-1].name in table_insert_mode_elements)
        if in_table:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging.
            parent, insert_before = self.get_table_misnested_node_position()
            parent.insert_text(data, insert_before)
        else:
            parent.insert_text(data)

    def get_table_misnested_node_position(self):
        """Get foster parent element and sibling (or None) to insert before."""
        # The foster parent element is the one which comes before the most
        # recently opened table element.
        last_table = next(
            (element for element in reversed(self.open_elements)
             if element.name == "table"),
            None)

        if last_table is None:
            # No open table: use the first open element.
            return self.open_elements[0], None

        # XXX - we should really check that this parent is actually a
        # node here.
        if last_table.parent is not None:
            return last_table.parent, last_table

        # The table has no parent: use the element opened just before it.
        index = self.open_elements.index(last_table) - 1
        return self.open_elements[index], None

    def generate_implied_end_tags(self, exclude=None):
        """Close the current nodes whose name is in _implied_end_tags.

        Open elements are popped until the name of the current node is not
        in _implied_end_tags, or is exclude.
        """
        # XXX This is not entirely what the specification says. We should
        # investigate it more closely.
        while True:
            name = self.open_elements[-1].name
            if name not in _implied_end_tags or name == exclude:
                return
            self.open_elements.pop()

    def get_document(self, full_tree=False):
        """Return the final tree.

        Return the ElementTree element of the whole document if full_tree is
        true. Otherwise return its first child "html" element, or None if
        there is none.
        """
        if full_tree:
            return self.document._element
        return self.document._element.find(
            "html" if self.default_namespace is None else
            f"{{{self.default_namespace}}}html")

    def get_fragment(self):
        """Return the final fragment.

        The children and text of the first open element are moved into a new
        DocumentFragment, whose ElementTree element is returned.
        """
        fragment = DocumentFragment()
        self.open_elements[0].reparent_children(fragment)
        return fragment._element