"""Tests to ensure that the html5lib tree builder generates good trees.""" import pytest import warnings from bs4 import BeautifulSoup from bs4.filter import SoupStrainer from . import ( HTML5LIB_PRESENT, HTML5TreeBuilderSmokeTest, ) @pytest.mark.skipif( not HTML5LIB_PRESENT, reason="html5lib seems not to be present, not testing its tree builder.", ) class TestHTML5LibBuilder(HTML5TreeBuilderSmokeTest): """See ``HTML5TreeBuilderSmokeTest``.""" @property def default_builder(self): from bs4.builder import HTML5TreeBuilder return HTML5TreeBuilder def test_soupstrainer(self): # The html5lib tree builder does not support parse_only. strainer = SoupStrainer("b") markup = "

A bold statement.

" with warnings.catch_warnings(record=True) as w: soup = BeautifulSoup(markup, "html5lib", parse_only=strainer) assert soup.decode() == self.document_for(markup) [warning] = w assert warning.filename == __file__ assert "the html5lib tree builder doesn't support parse_only" in str( warning.message ) def test_correctly_nested_tables(self): """html5lib inserts tags where other parsers don't.""" markup = ( '' "" "" ) self.assert_soup( markup, '

Here's another table:" '' "" "

foo

Here\'s another table:' '

foo

' "

", ) self.assert_soup( "" "" "

Foo
Bar
Baz

" ) def test_xml_declaration_followed_by_doctype(self): markup = """

foo

""" soup = self.soup(markup) # Verify that we can reach the

tag; this means the tree is connected. assert b"

foo

" == soup.p.encode() def test_reparented_markup(self): markup = "

foo

bar

" soup = self.soup(markup) assert ( "

foo

bar

" == soup.body.decode() ) assert 2 == len(soup.find_all("p")) def test_reparented_markup_ends_with_whitespace(self): markup = "

foo

bar

\n" soup = self.soup(markup) assert ( "

foo

bar

\n" == soup.body.decode() ) assert 2 == len(soup.find_all("p")) def test_reparented_markup_containing_identical_whitespace_nodes(self): """Verify that we keep the two whitespace nodes in this document distinct when reparenting the adjacent tags. """ markup = "

" soup = self.soup(markup) space1, space2 = soup.find_all(string=" ") tbody1, tbody2 = soup.find_all("tbody") assert space1.next_element is tbody1 assert tbody2.next_element is space2 def test_reparented_markup_containing_children(self): markup = ( "

aftermath

" ) soup = self.soup(markup) noscript = soup.noscript assert "target" == noscript.next_element target = soup.find(string="target") # The 'aftermath' string was duplicated; we want the second one. final_aftermath = soup.find_all(string="aftermath")[-1] # The

tag, # but the 'target' string within is still connected to the # (second) 'aftermath' string. assert final_aftermath == target.next_element assert target == final_aftermath.previous_element def test_processing_instruction(self): """Processing instructions become comments.""" markup = b"""""" soup = self.soup(markup) assert str(soup).startswith("") def test_cloned_multivalue_node(self): markup = b"""""" soup = self.soup(markup) a1, a2 = soup.find_all("a") assert a1 == a2 assert a1 is not a2 def test_foster_parenting(self): markup = b"""

hello \n\n\ntext%s%s%s