import pytest

from bs4.element import Tag
from bs4.formatter import (
    Formatter,
    HTMLFormatter,
    XMLFormatter,
)
from . import SoupTest


# NOTE(review): the HTML markup inside the string literals below (tags,
# ``&amp;`` escapes and indentation widths) was restored after it had been
# stripped from this file; each literal was rebuilt from what the
# surrounding assertions require. Confirm against the upstream test suite.
class TestFormatter(SoupTest):
    """Tests for Formatter, HTMLFormatter and XMLFormatter output rules."""

    def test_default_attributes(self):
        # Test the default behavior of Formatter.attributes().
        formatter = Formatter()
        tag = Tag(name="tag")
        tag["b"] = "1"
        tag["a"] = "2"

        # Attributes come out sorted by name, even though a dictionary
        # would normally yield them in the order they were added.
        assert [("a", "2"), ("b", "1")] == formatter.attributes(tag)

        # This works even if Tag.attrs is None, though this shouldn't
        # normally happen.
        tag.attrs = None
        assert [] == formatter.attributes(tag)

        assert " " == formatter.indent

    def test_sort_attributes(self):
        # Test the ability to override Formatter.attributes() to,
        # e.g., filter out some attributes while keeping the rest sorted.
        class FilteringFormatter(Formatter):
            def attributes(self, tag):
                # Remember the tag so the test can verify the hook ran.
                self.called_with = tag
                for k, v in sorted(tag.attrs.items()):
                    if k == "ignore":
                        continue
                    yield k, v

        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
        formatter = FilteringFormatter()
        decoded = soup.decode(formatter=formatter)

        # attributes() was called on the <p> tag. It filtered out one
        # attribute and sorted the other two.
        assert formatter.called_with == soup.p
        assert '<p aval="2" cval="1"></p>' == decoded

    def test_empty_attributes_are_booleans(self):
        # Test the behavior of empty_attributes_are_booleans as well
        # as which Formatters have it enabled.
        for name in ("html", "minimal", None):
            formatter = HTMLFormatter.REGISTRY[name]
            assert False is formatter.empty_attributes_are_booleans

        formatter = XMLFormatter.REGISTRY[None]
        assert False is formatter.empty_attributes_are_booleans

        formatter = HTMLFormatter.REGISTRY["html5"]
        assert True is formatter.empty_attributes_are_booleans

        # Verify that the constructor sets the value.
        formatter = Formatter(empty_attributes_are_booleans=True)
        assert True is formatter.empty_attributes_are_booleans

        # Now demonstrate what it does to markup.
        for markup in ("<option selected></option>", '<option selected=""></option>'):
            soup = self.soup(markup)

            # Formatters with the flag off spell out the empty value.
            # The loop variable is passed through so that each named
            # formatter is really exercised (previously "html" was
            # hard-coded on every iteration).
            # NOTE(review): "xml" was dropped from this tuple because
            # nothing in this file shows it to be a registered
            # HTMLFormatter name -- confirm before re-adding it.
            for name in ("html", "minimal", None):
                assert b'<option selected=""></option>' == soup.option.encode(
                    formatter=name
                )

            # The html5 formatter has the flag on and emits a bare
            # boolean attribute.
            assert b"<option selected></option>" == soup.option.encode(
                formatter="html5"
            )

    @pytest.mark.parametrize(
        "indent,expect",
        [
            (None, "<a>\n<b>\ntext\n</b>\n</a>\n"),
            (-1, "<a>\n<b>\ntext\n</b>\n</a>\n"),
            (0, "<a>\n<b>\ntext\n</b>\n</a>\n"),
            ("", "<a>\n<b>\ntext\n</b>\n</a>\n"),
            (1, "<a>\n <b>\n  text\n </b>\n</a>\n"),
            (2, "<a>\n  <b>\n    text\n  </b>\n</a>\n"),
            ("\t", "<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n"),
            ("abc", "<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n"),
            # Some invalid inputs -- the default behavior is used.
            (object(), "<a>\n <b>\n  text\n </b>\n</a>\n"),
            (b"bytes", "<a>\n <b>\n  text\n </b>\n</a>\n"),
        ],
    )
    def test_indent(self, indent, expect):
        # Pretty-print a tree with a Formatter set to
        # indent in a certain way and verify the results.
        soup = self.soup("<a><b>text</b></a>")
        formatter = Formatter(indent=indent)
        assert soup.prettify(formatter=formatter) == expect

        # Pretty-printing only happens with prettify(), not
        # encode().
        assert soup.encode(formatter=formatter) != expect

    def test_default_indent_value(self):
        # With no arguments, a Formatter indents by a single space.
        formatter = Formatter()
        assert formatter.indent == " "

    @pytest.mark.parametrize(
        "formatter,expect",
        [
            (HTMLFormatter(indent=1), "<p>\n a\n</p>\n"),
            (HTMLFormatter(indent=2), "<p>\n  a\n</p>\n"),
            (XMLFormatter(indent=1), "<p>\n a\n</p>\n"),
            (XMLFormatter(indent="\t"), "<p>\n\ta\n</p>\n"),
        ],
    )
    def test_indent_subclasses(self, formatter, expect):
        # The indent argument is honored by the Formatter subclasses too.
        soup = self.soup("<p>a</p>")
        assert expect == soup.p.prettify(formatter=formatter)

    @pytest.mark.parametrize(
        "s,expect_html,expect_html5",
        [
            # The html5 formatter is much less aggressive about escaping ampersands
            # than the html formatter.
            ("foo & bar", "foo &amp; bar", "foo & bar"),
            ("foo&", "foo&amp;", "foo&"),
            ("foo&&& bar", "foo&amp;&amp;&amp; bar", "foo&&& bar"),
            ("x=1&y=2", "x=1&amp;y=2", "x=1&y=2"),
            ("&123", "&amp;123", "&123"),
            ("&abc", "&amp;abc", "&abc"),
            ("foo &0 bar", "foo &amp;0 bar", "foo &0 bar"),
            ("foo &lolwat bar", "foo &amp;lolwat bar", "foo &lolwat bar"),
            # But both formatters escape what the HTML5 spec considers ambiguous ampersands.
            ("&nosuchentity;", "&amp;nosuchentity;", "&amp;nosuchentity;"),
        ],
    )
    def test_entity_substitution(self, s, expect_html, expect_html5):
        assert HTMLFormatter.REGISTRY["html"].substitute(s) == expect_html
        assert HTMLFormatter.REGISTRY["html5"].substitute(s) == expect_html5
        # The "html5-4.12" formatter is expected to escape like "html".
        assert HTMLFormatter.REGISTRY["html5-4.12"].substitute(s) == expect_html

    def test_entity_round_trip(self):
        # This is more an explanatory test and a way to avoid regressions than a test of functionality.

        markup = "<p>Some division signs: ÷ &divide; &#247; &#xf7;. These are made with: ÷ &amp;divide; &amp;#247;</p>"
        soup = self.soup(markup)
        assert (
            "Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ &divide; &#247;"
            == soup.p.string
        )

        # Oops, I forgot to mention the entity.
        soup.p.string = soup.p.string + " &#xf7;"

        assert (
            "Some division signs: ÷ ÷ ÷ ÷. These are made with: ÷ &divide; &#247; &#xf7;"
            == soup.p.string
        )

        # On output, each literal division sign becomes an entity and each
        # literal ampersand that would read as an entity is escaped.
        expect = "<p>Some division signs: &divide; &divide; &divide; &divide;. These are made with: &divide; &amp;divide; &amp;#247; &amp;#xf7;</p>"
        assert expect == soup.p.decode(formatter="html")
        assert expect == soup.p.decode(formatter="html5")

        # A lone ampersand is escaped by "html" but left alone by "html5".
        markup = "<p>a & b</p>"
        soup = self.soup(markup)
        assert "<p>a &amp; b</p>" == soup.p.decode(formatter="html")
        assert "<p>a & b</p>" == soup.p.decode(formatter="html5")