"""Tests to ensure that the lxml tree builder generates good trees.""" import pickle import pytest import warnings from . import LXML_PRESENT, LXML_VERSION if LXML_PRESENT: from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML from bs4 import ( BeautifulStoneSoup, ) from . import ( HTMLTreeBuilderSmokeTest, XMLTreeBuilderSmokeTest, SOUP_SIEVE_PRESENT, ) @pytest.mark.skipif( not LXML_PRESENT, reason="lxml seems not to be present, not testing its tree builder.", ) class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @property def default_builder(self): return LXMLTreeBuilder def test_out_of_range_entity(self): self.assert_soup("

foo�bar

", "

foobar

") self.assert_soup("

foo�bar

", "

foobar

") self.assert_soup("

foo�bar

", "

foobar

") def test_entities_in_foreign_document_encoding(self): # We can't implement this case correctly because by the time we # hear about markup like "“", it's been (incorrectly) converted into # a string like u'\x93' pass # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. @pytest.mark.skipif( not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0), reason="Skipping doctype test for old version of lxml to avoid segfault.", ) def test_empty_doctype(self): soup = self.soup("") doctype = soup.contents[0] assert "" == doctype.strip() def test_beautifulstonesoup_is_xml_parser(self): # Make sure that the deprecated BSS class uses an xml builder # if one is installed. with warnings.catch_warnings(record=True) as w: soup = BeautifulStoneSoup("") assert "" == str(soup.b) [warning] = w assert warning.filename == __file__ assert "The BeautifulStoneSoup class was deprecated" in str(warning.message) def test_tracking_line_numbers(self): # The lxml TreeBuilder cannot keep track of line numbers from # the original markup. Even if you ask for line numbers, we # don't have 'em. # # However, for consistency with other parsers, Tag.sourceline # and Tag.sourcepos are always set to None, rather than being # available as an alias for find(). soup = self.soup( "\n
\n\n\ntext
", store_line_numbers=True, ) assert None is soup.p.sourceline assert None is soup.p.sourcepos @pytest.mark.skipif( not LXML_PRESENT, reason="lxml seems not to be present, not testing its XML tree builder.", ) class TestLXMLXMLTreeBuilder(XMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @property def default_builder(self): return LXMLTreeBuilderForXML def test_namespace_indexing(self): soup = self.soup( '\n' "" 'content' 'content' '' '' '' "" "" ) # The BeautifulSoup object includes every namespace prefix # defined in the entire document. This is the default set of # namespaces used by soupsieve. # # Un-prefixed namespaces are not included, and if a given # prefix is defined twice, only the first prefix encountered # in the document shows up here. assert soup._namespaces == { "xml": "http://www.w3.org/XML/1998/namespace", "prefix": "http://prefixed-namespace.com", "prefix2": "http://another-namespace.com", } # A Tag object includes only the namespace prefixes # that were in scope when it was parsed. # We do not track un-prefixed namespaces as we can only hold # one (the first one), and it will be recognized as the # default namespace by soupsieve, even when operating from a # tag with a different un-prefixed namespace. assert soup.tag._namespaces == { "xml": "http://www.w3.org/XML/1998/namespace", } assert soup.tag2._namespaces == { "prefix": "http://prefixed-namespace.com", "xml": "http://www.w3.org/XML/1998/namespace", } assert soup.subtag._namespaces == { "prefix2": "http://another-namespace.com", "xml": "http://www.w3.org/XML/1998/namespace", } assert soup.subsubtag._namespaces == { "prefix2": "http://another-namespace.com", "xml": "http://www.w3.org/XML/1998/namespace", } @pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed") def test_namespace_interaction_with_select_and_find(self): # Demonstrate how namespaces interact with select* and # find* methods. soup = self.soup( '\n' "" 'content' 'content' '' "" "" "" ) # soupselect uses namespace URIs. assert soup.select_one("tag").name == "tag" assert soup.select_one("prefix|tag2").name == "tag2" # If a prefix is declared more than once, only the first usage # is registered with the BeautifulSoup object. assert soup.select_one("prefix|tag3") is None # But you can always explicitly specify a namespace dictionary. assert ( soup.select_one("prefix|tag3", namespaces=soup.subtag._namespaces).name == "tag3" ) # And a Tag (as opposed to the BeautifulSoup object) will # have a set of default namespaces scoped to that Tag. assert soup.subtag.select_one("prefix|tag3").name == "tag3" # the find() methods aren't fully namespace-aware; they just # look at prefixes. assert soup.find("tag").name == "tag" assert soup.find("prefix:tag2").name == "tag2" assert soup.find("prefix:tag3").name == "tag3" assert soup.subtag.find("prefix:tag3").name == "tag3" def test_pickle_restores_builder(self): # The lxml TreeBuilder is not picklable, so when unpickling # a document created with it, a new TreeBuilder of the # appropriate class is created. soup = self.soup("some markup") assert isinstance(soup.builder, self.default_builder) pickled = pickle.dumps(soup) unpickled = pickle.loads(pickled) assert "some markup" == unpickled.a.string assert unpickled.builder != soup.builder assert isinstance(unpickled.builder, self.default_builder)