# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ import pytest import re import warnings from bs4 import BeautifulSoup from bs4.builder import builder_registry from bs4.element import ( AttributeResemblesVariableWarning, CData, Comment, NavigableString, Tag, ) from bs4.filter import SoupStrainer from . import ( SoupTest, ) class TestFind(SoupTest): """Basic tests of the find() method. """ def test_find_tag(self): soup = self.soup("1234") assert soup.find("b").string == "2" def test_unicode_text_find(self): soup = self.soup("
text
") assert "div" == soup.find().name assert "div" == soup.find("p").find_previous_sibling().name assert "p" == soup.find("div").find_next_sibling().name def test_find_with_function_can_only_find_tags(self): soup = self.soup("texttext
") assert "p" == soup.find(lambda t: t.name=="p").name assert None == soup.find(lambda t: t=="text") class TestFindAll(SoupTest): """Basic tests of the find_all() method.""" def test_find_all_with_no_arguments_only_finds_tags(self): soup = self.soup("texttext
") assert 2 == len(soup.body.find_all()) assert 1 == len(soup.find("p").find_previous_siblings()) assert 1 == len(soup.find("div").find_next_siblings()) def test_find_all_text_nodes(self): """You can search the tree for text nodes.""" soup = self.soup("Foobar\xbb") # Exact match. assert soup.find_all(string="bar") == ["bar"] # Match any of a number of strings. assert soup.find_all(string=["Foo", "bar"]) == ["Foo", "bar"] # Match a regular expression. assert soup.find_all(string=re.compile(".*")) == ["Foo", "bar", "\xbb"] # Match anything. assert soup.find_all(string=True) == ["Foo", "bar", "\xbb"] def test_find_all_limit(self): """You can limit the number of items returned by find_all.""" soup = self.soup("12345") self.assert_selects(soup.find_all("a", limit=3), ["1", "2", "3"]) self.assert_selects(soup.find_all("a", limit=1), ["1"]) self.assert_selects(soup.find_all("a", limit=10), ["1", "2", "3", "4", "5"]) # A limit of 0 means no limit. self.assert_selects(soup.find_all("a", limit=0), ["1", "2", "3", "4", "5"]) def test_calling_a_tag_is_calling_findall(self): soup = self.soup("123") self.assert_selects(soup("a", limit=1), ["1"]) self.assert_selects(soup.b(id="foo"), ["3"]) def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion( self, ): soup = self.soup("") # Create a self-referential list. selfref = [] selfref.append(selfref) # Without special code in SoupStrainer, this would cause infinite # recursion. with warnings.catch_warnings(record=True) as w: assert [] == soup.find_all(selfref) [warning] = w assert warning.filename == __file__ msg = str(warning.message) assert ( msg == "Ignoring nested list [[...]] to avoid the possibility of infinite recursion." ) def test_find_all_resultset(self): """All find_all calls return a ResultSet""" soup = self.soup("") result = soup.find_all("a") assert hasattr(result, "source") result = soup.find_all(True) assert hasattr(result, "source") result = soup.find_all(string="foo") assert hasattr(result, "source") class TestFindAllBasicNamespaces(SoupTest): def test_find_by_namespaced_name(self): soup = self.soup('Don't leave me here.
Don\'t leave!
""" soup = self.soup(doc) second_para = soup.find(id="2") bold = soup.b # Move the tag to the end of the second paragraph. soup.find(id="2").append(soup.b) # The tag is now a child of the second paragraph. assert bold.parent == second_para assert soup.decode() == self.document_for( 'Don\'t leave me .
\n' 'Don\'t leave!here
' ) def test_insertion_returns_inserted_things(self): soup = self.soup("") html = soup.find('html') head = html.append(soup.new_tag('head')) assert head.name == 'head' [title] = head.insert(0, soup.new_tag('title')) assert title.name == 'title' text5 = title.append('5') assert text5 == '5' text34 = text5.insert_before('3', '4') assert text34 == ['3', '4'] text67 = text5.insert_after('6', '7') assert text67 == ['6', '7'] text89 = title.extend(['8', '9']) assert text89 == ['8', '9'] assert title.get_text() == '3456789' def test_replace_with_returns_thing_that_was_replaced(self): text = "And now, a word:
And we're back.
") p2, p3 = soup.insert(1, soup.new_tag("p", string="p2"), soup.new_tag("p", string="p3")) assert "p2" == p2.string assert "p3" == p3.string p1, p2, p3, p4 = list(soup.children) assert "And now, a word:" == p1.string assert "p2" == p2.string assert "p3" == p3.string assert "And we're back." == p4.string def test_insert_beautifulsoup_object_inserts_children(self): """Inserting one BeautifulSoup object into another actually inserts all of its children -- you'll never combine BeautifulSoup objects. """ soup = self.soup("And now, a word:
And we're back.
") text = "p2
p3
" to_insert = self.soup(text) p2, p3 = soup.insert(1, to_insert) assert "p2" == p2.string assert "p3" == p3.string for i in soup.descendants: assert not isinstance(i, BeautifulSoup) p1, p2, p3, p4 = list(soup.children) assert "And now, a word:" == p1.string assert "p2" == p2.string assert "p3" == p3.string assert "And we're back." == p4.string def test_replace_with_maintains_next_element_throughout(self): soup = self.soup("onethree
") a = soup.a # Make it so the tag has two text children. a.insert(1, "two") # Now replace each one with the empty string. left, right = a.contents left.replace_with("") right.replace_with("") # The tag is still connected to the tree. assert "three" == soup.b.string def test_replace_final_node(self): soup = self.soup("Argh!") soup.find(string="Argh!").replace_with("Hooray!") new_text = soup.find(string="Hooray!") b = soup.b assert new_text.previous_element == b assert new_text.parent == b assert new_text.previous_element.next_element == new_text assert new_text.next_element is None def test_consecutive_text_nodes(self): # A builder should never create two consecutive text nodes, # but if you insert one next to another, Beautiful Soup will # handle it correctly. soup = self.soup("Argh!There's no business like show business
") no, show = soup.find_all("b") show.replace_with(no) assert soup.decode() == self.document_for( "There's business like no business
" ) assert show.parent is None assert no.parent == soup.p assert no.next_element == "no" assert no.next_sibling == " business" def test_replace_with_errors(self): # Can't replace a tag that's not part of a tree. a_tag = Tag(name="a") with pytest.raises(ValueError): a_tag.replace_with("won't work") # Can't replace a tag with its parent. a_tag = self.soup("").a with pytest.raises(ValueError): a_tag.b.replace_with(a_tag) # Or with a list that includes its parent. with pytest.raises(ValueError): a_tag.b.replace_with("string1", a_tag, "string2") def test_replace_with_multiple(self): data = "Unneeded formatting is unneeded
""") tree.em.unwrap() assert tree.em is None assert tree.p.text == "Unneeded formatting is unneeded" def test_wrap(self): soup = self.soup("I wish I was bold.") value = soup.string.wrap(soup.new_tag("b")) assert value.decode() == "I wish I was bold." assert soup.decode() == self.document_for("I wish I was bold.") def test_wrap_extracts_tag_from_elsewhere(self): soup = self.soup("I wish I was bold.") soup.b.next_sibling.wrap(soup.b) assert soup.decode() == self.document_for("I wish I was bold.") def test_wrap_puts_new_contents_at_the_end(self): soup = self.soup("I like being bold.I wish I was bold.") soup.b.next_sibling.wrap(soup.b) assert 2 == len(soup.b.contents) assert soup.decode() == self.document_for( "I like being bold.I wish I was bold." ) def test_extract(self): soup = self.soup( 'Some content. More content.' ) assert len(soup.body.contents) == 3 extracted = soup.find(id="nav").extract() assert soup.decode() == "Some content. More content." assert extracted.decode() == ' ' # The extracted tag is now an orphan. assert len(soup.body.contents) == 2 assert extracted.parent is None assert extracted.previous_element is None assert extracted.next_element.next_element is None # The gap where the extracted tag used to be has been mended. content_1 = soup.find(string="Some content. ") content_2 = soup.find(string=" More content.") assert content_1.next_element == content_2 assert content_1.next_sibling == content_2 assert content_2.previous_element == content_1 assert content_2.previous_sibling == content_1 def test_extract_distinguishes_between_identical_strings(self): soup = self.soup("foobar") foo_1 = soup.a.string foo_2 = soup.new_string("foo") bar_2 = soup.new_string("bar") soup.a.append(foo_2) soup.b.append(bar_2) # Now there are two identical strings in the tag, and two # in the tag. Let's remove the first "foo" and the second # "bar". foo_1.extract() bar_2.extract() assert foo_2 == soup.a.string assert bar_2 == soup.b.string def test_extract_multiples_of_same_tag(self): soup = self.soup(""" """) [soup.script.extract() for i in soup.find_all("script")] assert "\n\n\n" == str(soup.body) def test_extract_works_when_element_is_surrounded_by_identical_strings(self): soup = self.soup("\n" "hi\n" "") soup.find("body").extract() assert None is soup.find("body") def test_clear(self): """Tag.clear()""" soup = self.soup("String Italicized and another
") # clear using extract() a = soup.a soup.p.clear() assert len(soup.p.contents) == 0 assert hasattr(a, "contents") # clear using decompose() em = a.em a.clear(decompose=True) assert 0 == len(em.contents) @pytest.mark.parametrize( "method_name,expected_result", [ ( "descendants", 'child3
tag from the tree removes all of its children from the tree as well. 'child'.next_element becomes None, because 'child' is no longer in the tree, and iteration stops there. Don't do this kind of thing, is what I'm saying. ( "next_elements", '
child3
child3
child3
child3
Another para
") p1, p2 = soup.find_all("p") a = p1.a text = p1.em.string for i in [p1, p2, a, text]: assert False is i.decomposed # This sets p1 and everything beneath it to decomposed. p1.decompose() for i in [p1, a, text]: assert True is i.decomposed # p2 is unaffected. assert False is p2.decomposed def test_decompose_string(self): soup = self.soup("String 1
String 2
") div = soup.div text = div.p.string assert False is text.decomposed text.decompose() assert True is text.decomposed assert "String 2