import pytest
import types
from bs4 import (
BeautifulSoup,
ResultSet,
)
from typing import (
Any,
List,
Tuple,
Type,
)
from packaging.version import Version
from . import (
SoupTest,
SOUP_SIEVE_PRESENT,
)
# Exception type that test_unsupported_pseudoclass expects when an unknown
# pseudo-class is used. It is only assigned when Soup Sieve is installed.
SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS: Type[Exception]
if SOUP_SIEVE_PRESENT:
    from soupsieve import __version__, SelectorSyntaxError

    # The exception changed in soupsieve 2.6. Beautiful Soup still runs on
    # Python versions that soupsieve 2.6 does not support, so the tests
    # have to accept either behavior depending on the installed version.
    if Version(__version__) < Version("2.6"):
        SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS = NotImplementedError
    else:
        SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS = SelectorSyntaxError
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
"""Test basic CSS selector functionality.
This functionality is implemented in soupsieve, which has a much
more comprehensive test suite, so this is basically an extra check
that soupsieve works as expected.
"""
HTML = """
The title
Hello there.
English
English UK
English US
French
"""
def setup_method(self):
    """Parse the class-level ``HTML`` fixture and keep it as ``self._soup``."""
    parsed = BeautifulSoup(self.HTML, "html.parser")
    self._soup = parsed
def assert_css_selects(
    self, selector: str, expected_ids: List[str], **kwargs: Any
) -> None:
    """Assert that ``selector`` matches exactly the tags with ``expected_ids``.

    The comparison is order-insensitive: both lists of IDs are sorted
    before being compared.

    :param selector: CSS selector passed to ``self._soup.select``.
    :param expected_ids: ``id`` values of the tags expected to match.
        The caller's list is not modified.
    :param kwargs: Extra keyword arguments forwarded to ``select``
        (for example ``limit``).
    :raises AssertionError: If the result is not a ``ResultSet`` or the
        matched IDs differ from the expected ones.
    """
    results = self._soup.select(selector, **kwargs)
    assert isinstance(results, ResultSet)
    # sorted() builds new lists, so the caller's expected_ids keeps its order.
    found_ids = sorted(el["id"] for el in results)
    wanted_ids = sorted(expected_ids)
    assert wanted_ids == found_ids, "Selector %s, expected [%s], got [%s]" % (
        selector,
        ", ".join(wanted_ids),
        ", ".join(found_ids),
    )


# Alternate name bound to the same helper.
assertSelect = assert_css_selects
def assert_css_select_multiple(self, *tests: Tuple[str, List[str]]):
    """Run ``assert_css_selects`` once for every ``(selector, ids)`` pair.

    :param tests: Two-item tuples of a CSS selector and the list of
        ``id`` values it is expected to match.
    """
    for css, wanted_ids in tests:
        self.assert_css_selects(css, wanted_ids)
def test_precompiled(self):
    """A selector compiled via ``css.compile`` works with ``select`` and ``select_one``."""
    compiled = self._soup.css.compile("div")

    matches = self._soup.select(compiled)
    assert len(matches) == 4
    for tag in matches:
        assert tag.name == "div"

    first = self._soup.select_one(compiled)
    assert first["id"] == "main"
def test_one_tag_one(self):
    """Selecting ``title`` yields a single ``title`` tag containing "The title"."""
    matches = self._soup.select("title")
    assert len(matches) == 1
    title = matches[0]
    assert title.name == "title"
    assert title.contents == ["The title"]
def test_one_tag_many(self):
    """Selecting ``div`` yields four ``div`` tags; ``select_one`` returns the one with id ``main``."""
    matches = self._soup.select("div")
    assert len(matches) == 4
    for tag in matches:
        assert tag.name == "div"

    first = self._soup.select_one("div")
    assert first["id"] == "main"
def test_select_one_returns_none_if_no_match(self):
    """``select_one`` returns ``None`` when the selector matches nothing."""
    result = self._soup.select_one("nonexistenttag")
    assert result is None
def test_tag_in_tag_one(self):
    """The descendant selector ``div div`` matches the ``inner`` and ``data1`` tags."""
    nested_div_ids = ["inner", "data1"]
    self.assert_css_selects("div div", nested_div_ids)
def test_tag_in_tag_many(self):
    """Several ancestor/descendant spellings all select the same four ``div`` tags."""
    selectors = ("html div", "html body div", "body div")
    for css in selectors:
        # A fresh list per call, since the helper receives it as an argument.
        self.assert_css_selects(css, ["data1", "main", "inner", "footer"])
def test_limit(self):
    """The ``limit`` keyword caps how many matches ``select`` returns."""
    cases = (
        ("html div", ["main"], 1),
        ("html body div", ["inner", "main"], 2),
        # A limit larger than the number of matches returns all of them.
        ("body div", ["data1", "main", "inner", "footer"], 10),
    )
    for css, wanted_ids, cap in cases:
        self.assert_css_selects(css, wanted_ids, limit=cap)
def test_tag_no_match(self):
    """Selecting the ``del`` tag name returns an empty result."""
    matches = self._soup.select("del")
    assert len(matches) == 0
def test_invalid_tag(self):
    """The selector ``tag%t`` raises ``SelectorSyntaxError``."""
    bad_selector = "tag%t"
    with pytest.raises(SelectorSyntaxError):
        self._soup.select(bad_selector)
def test_select_dashed_tag_ids(self):
    """A tag name containing dashes can be used as a selector."""
    dashed_ids = ["dash1", "dash2"]
    self.assert_css_selects("custom-dashed-tag", dashed_ids)
def test_select_dashed_by_id(self):
    """A dashed tag name combined with an ``id`` attribute selector finds ``dash2``."""
    matches = self._soup.select('custom-dashed-tag[id="dash2"]')
    first = matches[0]
    assert first.name == "custom-dashed-tag"
    assert first["id"] == "dash2"
def test_dashed_tag_text(self):
    """The first dashed tag directly under ``body`` has the text "Hello there."."""
    matches = self._soup.select("body > custom-dashed-tag")
    assert matches[0].text == "Hello there."
def test_select_dashed_matches_find_all(self):
    """``select`` and ``find_all`` return equal results for a dashed tag name."""
    via_select = self._soup.select("custom-dashed-tag")
    via_find_all = self._soup.find_all("custom-dashed-tag")
    assert via_select == via_find_all
def test_header_tags(self):
    """``h1`` and ``h2`` selectors match the expected header IDs."""
    cases = [
        ("h1", ["header1"]),
        ("h2", ["header2", "header3"]),
    ]
    self.assert_css_select_multiple(*cases)
def test_class_one(self):
    """Each spelling of the ``onep`` class selector finds the same single ``p`` tag."""
    selectors = (".onep", "p.onep", "html p.onep")
    for css in selectors:
        matches = self._soup.select(css)
        assert len(matches) == 1
        paragraph = matches[0]
        assert paragraph.name == "p"
        assert paragraph["class"] == ["onep"]
def test_class_mismatched_tag(self):
    """``div.onep`` matches nothing."""
    matches = self._soup.select("div.onep")
    assert len(matches) == 0
def test_one_id(self):
    """Each spelling of the ``#inner`` selector finds only the ``inner`` tag."""
    selectors = ("div#inner", "#inner", "div div#inner")
    for css in selectors:
        self.assert_css_selects(css, ["inner"])
def test_bad_id(self):
    """Selecting ``#doesnotexist`` returns an empty result."""
    matches = self._soup.select("#doesnotexist")
    assert len(matches) == 0
def test_items_in_id(self):
    """``div#inner p`` finds three ``p`` tags; the second has class ``onep``, the first has none."""
    paragraphs = self._soup.select("div#inner p")
    assert len(paragraphs) == 3
    for tag in paragraphs:
        assert tag.name == "p"

    first, second = paragraphs[0], paragraphs[1]
    assert second["class"] == ["onep"]
    assert not first.has_attr("class")
def test_a_bunch_of_emptys(self):
    """Several selectors rooted at ``div#main`` all return no matches."""
    selectors = ("div#main del", "div#main div.oops", "div div#main")
    for css in selectors:
        matches = self._soup.select(css)
        assert len(matches) == 0
def test_multi_class_support(self):
    """Each of the three classes selects the ``pmulti`` tag, alone or combined with tag/ancestor selectors."""
    selectors = (
        ".class1",
        "p.class1",
        ".class2",
        "p.class2",
        ".class3",
        "p.class3",
        "html p.class2",
        "div#inner .class2",
    )
    for css in selectors:
        self.assert_css_selects(css, ["pmulti"])
def test_multi_class_selection(self):
    """Chained class selectors, in any order, select the ``pmulti`` tag."""
    selectors = (".class1.class3", ".class3.class2", ".class1.class2.class3")
    for css in selectors:
        self.assert_css_selects(css, ["pmulti"])
def test_child_selector(self):
    """The ``>`` child combinator works alone and followed by a descendant selector."""
    cases = (
        (".s1 > a", ["s1a1", "s1a2"]),
        (".s1 > a span", ["s1a2s1"]),
    )
    for css, wanted_ids in cases:
        self.assert_css_selects(css, wanted_ids)
def test_child_selector_id(self):
    """A child combinator followed by an ID-qualified tag and a descendant finds ``s1a2s1``."""
    css = ".s1 > a#s1a2 span"
    self.assert_css_selects(css, ["s1a2s1"])
def test_attribute_equals(self):
    """``[attr="value"]`` selectors match on exact attribute values, with and without a tag name."""
    cases = [
        ('p[class="onep"]', ["p1"]),
        ('p[id="p1"]', ["p1"]),
        ('[class="onep"]', ["p1"]),
        ('[id="p1"]', ["p1"]),
        ('link[rel="stylesheet"]', ["l1"]),
        ('link[type="text/css"]', ["l1"]),
        ('link[href="blah.css"]', ["l1"]),
        ('link[href="no-blah.css"]', []),
        ('[rel="stylesheet"]', ["l1"]),
        ('[type="text/css"]', ["l1"]),
        ('[href="blah.css"]', ["l1"]),
        ('[href="no-blah.css"]', []),
        ('p[href="no-blah.css"]', []),
        ('[href="no-blah.css"]', []),
    ]
    self.assert_css_select_multiple(*cases)
def test_attribute_tilde(self):
    """``[attr~="value"]`` selectors match on ``class`` and ``rel`` values."""
    cases = [
        ('p[class~="class1"]', ["pmulti"]),
        ('p[class~="class2"]', ["pmulti"]),
        ('p[class~="class3"]', ["pmulti"]),
        ('[class~="class1"]', ["pmulti"]),
        ('[class~="class2"]', ["pmulti"]),
        ('[class~="class3"]', ["pmulti"]),
        ('a[rel~="friend"]', ["bob"]),
        ('a[rel~="met"]', ["bob"]),
        ('[rel~="friend"]', ["bob"]),
        ('[rel~="met"]', ["bob"]),
    ]
    self.assert_css_select_multiple(*cases)
def test_attribute_startswith(self):
    """``[attr^="prefix"]`` selectors match attribute values by prefix."""
    cases = [
        ('[rel^="style"]', ["l1"]),
        ('link[rel^="style"]', ["l1"]),
        ('notlink[rel^="notstyle"]', []),
        ('[rel^="notstyle"]', []),
        ('link[rel^="notstyle"]', []),
        ('link[href^="bla"]', ["l1"]),
        ('a[href^="http://"]', ["bob", "me"]),
        ('[href^="http://"]', ["bob", "me"]),
        ('[id^="p"]', ["pmulti", "p1"]),
        ('[id^="m"]', ["me", "main"]),
        ('div[id^="m"]', ["main"]),
        ('a[id^="m"]', ["me"]),
        ('div[data-tag^="dashed"]', ["data1"]),
    ]
    self.assert_css_select_multiple(*cases)
def test_attribute_endswith(self):
    """``[attr$="suffix"]`` selectors match attribute values by suffix."""
    ids_ending_in_1 = [
        "data1",
        "l1",
        "p1",
        "header1",
        "s1a1",
        "s2a1",
        "s1a2s1",
        "dash1",
    ]
    cases = [
        ('[href$=".css"]', ["l1"]),
        ('link[href$=".css"]', ["l1"]),
        ('link[id$="1"]', ["l1"]),
        ('[id$="1"]', ids_ending_in_1),
        ('div[id$="1"]', ["data1"]),
        ('[id$="noending"]', []),
    ]
    self.assert_css_select_multiple(*cases)
def test_attribute_contains(self):
    """``[attr*="text"]`` selectors match attribute values by substring.

    The cases reuse the prefix and suffix scenarios from the
    startswith/endswith tests and add a few substring-only ones.
    """
    ids_containing_1 = [
        "data1",
        "l1",
        "p1",
        "header1",
        "s1a1",
        "s1a2",
        "s2a1",
        "s1a2s1",
        "dash1",
    ]
    # From test_attribute_startswith
    prefix_cases = [
        ('[rel*="style"]', ["l1"]),
        ('link[rel*="style"]', ["l1"]),
        ('notlink[rel*="notstyle"]', []),
        ('[rel*="notstyle"]', []),
        ('link[rel*="notstyle"]', []),
        ('link[href*="bla"]', ["l1"]),
        ('[href*="http://"]', ["bob", "me"]),
        ('[id*="p"]', ["pmulti", "p1"]),
        ('div[id*="m"]', ["main"]),
        ('a[id*="m"]', ["me"]),
    ]
    # From test_attribute_endswith
    suffix_cases = [
        ('[href*=".css"]', ["l1"]),
        ('link[href*=".css"]', ["l1"]),
        ('link[id*="1"]', ["l1"]),
        ('[id*="1"]', ids_containing_1),
        ('div[id*="1"]', ["data1"]),
        ('[id*="noending"]', []),
    ]
    # New for this test
    substring_cases = [
        ('[href*="."]', ["bob", "me", "l1"]),
        ('a[href*="."]', ["bob", "me"]),
        ('link[href*="."]', ["l1"]),
        ('div[id*="n"]', ["main", "inner"]),
        ('div[id*="nn"]', ["inner"]),
        ('div[data-tag*="edval"]', ["data1"]),
    ]
    self.assert_css_select_multiple(
        *prefix_cases, *suffix_cases, *substring_cases
    )
def test_attribute_exact_or_hypen(self):
    """``[lang|="xx"]`` selectors match the expected ``lang-*`` tags and nothing for ``gb``."""
    english_ids = ["lang-en", "lang-en-gb", "lang-en-us"]
    cases = [
        # Each case gets its own copy of the shared expectation.
        ('p[lang|="en"]', list(english_ids)),
        ('[lang|="en"]', list(english_ids)),
        ('p[lang|="fr"]', ["lang-fr"]),
        ('p[lang|="gb"]', []),
    ]
    self.assert_css_select_multiple(*cases)
def test_attribute_exists(self):
    """``[attr]`` selectors match tags that carry the attribute, whatever its value."""
    cases = [
        ("[rel]", ["l1", "bob", "me"]),
        ("link[rel]", ["l1"]),
        ("a[rel]", ["bob", "me"]),
        ("[lang]", ["lang-en", "lang-en-gb", "lang-en-us", "lang-fr"]),
        ("p[class]", ["p1", "pmulti"]),
        ("[blah]", []),
        ("p[blah]", []),
        ("div[data-tag]", ["data1"]),
    ]
    self.assert_css_select_multiple(*cases)
def test_quoted_space_in_selector_name(self):
    """A quoted attribute value containing a space is matched as written.

    The fixture holds two ``div`` tags whose ``style`` values differ only
    after the space; the selector must pick exactly the one whose value
    is ``display: right``.
    """
    # The fixture needs real <div style=...> elements: without them the
    # selector below matches nothing and the single-item unpacking fails.
    html = (
        '<div style="display: wrong">nope</div>\n'
        '<div style="display: right">yes</div>\n'
    )
    soup = BeautifulSoup(html, "html.parser")
    # Single-item unpacking also asserts that exactly one tag matched.
    [chosen] = soup.select('div[style="display: right"]')
    assert "yes" == chosen.string
def test_unsupported_pseudoclass(self):
    """Unknown pseudo-classes and a malformed ``nth-of-type`` argument both raise.

    The exception for the unknown pseudo-class depends on the installed
    soupsieve version, so the module-level constant is used for it.
    """
    soup = self._soup

    with pytest.raises(SOUPSIEVE_EXCEPTION_ON_UNSUPPORTED_PSEUDOCLASS):
        soup.select("a:no-such-pseudoclass")

    with pytest.raises(SelectorSyntaxError):
        soup.select("a:nth-of-type(a)")
def test_nth_of_type(self):
    """``:nth-of-type(n)`` picks the n-th ``p`` inside ``div#inner``."""
    # First paragraph.
    first = self._soup.select("div#inner p:nth-of-type(1)")
    assert len(first) == 1
    assert first[0].string == "Some text"

    # Third paragraph.
    third = self._soup.select("div#inner p:nth-of-type(3)")
    assert len(third) == 1
    assert third[0].string == "Another"

    # There is no fourth paragraph, so nothing is selected.
    fourth = self._soup.select("div#inner p:nth-of-type(4)")
    assert len(fourth) == 0

    # An index of zero selects no tags.
    zeroth = self._soup.select("div p:nth-of-type(0)")
    assert len(zeroth) == 0
def test_nth_of_type_direct_descendant(self):
    """``:nth-of-type(1)`` combined with the ``>`` combinator finds the "Some text" paragraph."""
    matches = self._soup.select("div#inner > p:nth-of-type(1)")
    assert len(matches) == 1
    only = matches[0]
    assert only.string == "Some text"
def test_id_child_selector_nth_of_type(self):
    """An ID selector, child combinator and ``:nth-of-type(2)`` together select ``p1``."""
    css = "#inner > p:nth-of-type(2)"
    self.assert_css_selects(css, ["p1"])
def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element
# within the tree.
inner = self._soup.find("div", id="main")
selected = inner.select("div")
# The tag was selected. The