"""Tests to ensure that the html.parser tree builder generates good
trees."""
import pickle
import pytest
from bs4.builder._htmlparser import (
_DuplicateAttributeHandler,
BeautifulSoupHTMLParser,
HTMLParserTreeBuilder,
)
from bs4.exceptions import ParserRejectedMarkup
from typing import Any
from . import HTMLTreeBuilderSmokeTest
class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
default_builder = HTMLParserTreeBuilder
def test_rejected_input(self):
    """Markup that html.parser refuses must raise ParserRejectedMarkup.

    Python's html.parser will occasionally reject markup, especially
    when there is a problem with the initial DOCTYPE declaration.
    Different versions of Python sound the alarm in different ways,
    but Beautiful Soup consistently raises errors as
    ParserRejectedMarkup exceptions.
    """
    bad_markup = [
        # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
        # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
        # https://github.com/python/cpython/issues/81928
        #
        # NOTE(review): this entry used to be just b"\n", which contains
        # no declaration at all and therefore cannot trigger the
        # rejection this test expects. The bytes below (a newline
        # followed by a malformed "<![" marked section) are believed to
        # be the fuzzer sample the links above refer to -- confirm
        # against the linked corpus file.
        b"\n<![\xff\xfe\xfe\xcd\x00",
    ]
    for markup in bad_markup:
        # Every sample must be rejected individually, so the context
        # manager is re-entered for each one.
        with pytest.raises(ParserRejectedMarkup):
            self.soup(markup)
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_builder_is_pickled(self):
"""Unlike most tree builders, HTMLParserTreeBuilder and will
be restored after pickling.
"""
tree = self.soup("foo")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
assert isinstance(loaded.builder, type(tree.builder))
def test_redundant_empty_element_closing_tags(self):
self.assert_soup("
", "
")
self.assert_soup("", "")
def test_empty_element(self):
# This verifies that any buffered data present when the parser
# finishes working is handled.
self.assert_soup("foo bar", "foo &# bar")
def test_tracking_line_numbers(self):
# The html.parser TreeBuilder keeps track of line number and
# position of each element.
#
# NOTE(review): the statements below are not valid Python as written:
# string literals are split across physical lines and several
# statements are fused onto single lines. The only assertions left
# compare soup.p.decode() output for "&nosuchentity" text, which does
# not match the line-number purpose stated above -- presumably the
# markup inside the literals was stripped and the body of another test
# was merged in. Restore this method from version control before
# relying on it.
markup = "\n
\n\n
a &nosuchentity b
" soup = self.soup(markup) assert "a &nosuchentity b
" == soup.p.decode() markup = "a &nosuchentity; b
" soup = self.soup(markup) assert "a &nosuchentity b
" == soup.p.decode()