# Custom type aliases used throughout Beautiful Soup to improve readability.

# Notes on improvements to the type system in newer versions of Python
# that can be used once Beautiful Soup drops support for older
# versions:
#
# * ClassVar can be put on class variables now.
# * In 3.10, x|y is an accepted shorthand for Union[x,y].
# * In 3.10, TypeAlias gains capabilities that can be used to
#   improve the tree matching types (I don't remember what, exactly).
# * In 3.9 it's possible to specialize the re.Match type,
#   e.g. re.Match[str]. In 3.8 there's a typing.re namespace for this,
#   but it's removed in 3.12, so to support the widest possible set of
#   versions I'm not using it.

from typing_extensions import (
    runtime_checkable,
    Protocol,
    TypeAlias,
)
from typing import (
    Any,
    Callable,
    Dict,
    IO,
    Iterable,
    Mapping,
    Optional,
    Pattern,
    TYPE_CHECKING,
    Union,
)

# These names are only needed by static type checkers; they are referenced
# from string (forward-reference) annotations in the aliases in this module.
if TYPE_CHECKING:
    from bs4.element import (
        AttributeValueList,
        NamespacedAttribute,
        NavigableString,
        PageElement,
        ResultSet,
        Tag,
    )


@runtime_checkable
class _RegularExpressionProtocol(Protocol):
    """A protocol object which can accept either Python's built-in
    `re.Pattern` objects, or the similar ``Regex`` objects defined by the
    third-party ``regex`` package.
    """

    def search(
        self, string: str, pos: int = ..., endpos: int = ...
    ) -> Optional[Any]: ...

    @property
    def pattern(self) -> str: ...


# Aliases for markup in various stages of processing.
#

#: The rawest form of markup: either a string, bytestring, or an open filehandle.
_IncomingMarkup: TypeAlias = Union[str, bytes, IO[str], IO[bytes]]

#: Markup that is in memory but has (potentially) yet to be converted
#: to Unicode.
_RawMarkup: TypeAlias = Union[str, bytes]

# Aliases for character encodings
#

#: A data encoding.
_Encoding: TypeAlias = str

#: One or more data encodings.
_Encodings: TypeAlias = Iterable[_Encoding]

# Aliases for XML namespaces
#

#: The prefix for an XML namespace.
_NamespacePrefix: TypeAlias = str #: The URL of an XML namespace _NamespaceURL: TypeAlias = str #: A mapping of prefixes to namespace URLs. _NamespaceMapping: TypeAlias = Dict[_NamespacePrefix, _NamespaceURL] #: A mapping of namespace URLs to prefixes _InvertedNamespaceMapping: TypeAlias = Dict[_NamespaceURL, _NamespacePrefix] # Aliases for the attribute values associated with HTML/XML tags. # #: The value associated with an HTML or XML attribute. This is the #: relatively unprocessed value Beautiful Soup expects to come from a #: `TreeBuilder`. _RawAttributeValue: TypeAlias = str #: A dictionary of names to `_RawAttributeValue` objects. This is how #: Beautiful Soup expects a `TreeBuilder` to represent a tag's #: attribute values. _RawAttributeValues: TypeAlias = ( "Mapping[Union[str, NamespacedAttribute], _RawAttributeValue]" ) #: An attribute value in its final form, as stored in the # `Tag` class, after it has been processed and (in some cases) # split into a list of strings. _AttributeValue: TypeAlias = Union[str, "AttributeValueList"] #: A dictionary of names to :py:data:`_AttributeValue` objects. This is what #: a tag's attributes look like after processing. _AttributeValues: TypeAlias = Dict[str, _AttributeValue] #: The methods that deal with turning :py:data:`_RawAttributeValue` into #: :py:data:`_AttributeValue` may be called several times, even after the values #: are already processed (e.g. when cloning a tag), so they need to #: be able to acommodate both possibilities. _RawOrProcessedAttributeValues: TypeAlias = Union[_RawAttributeValues, _AttributeValues] #: A number of tree manipulation methods can take either a `PageElement` or a #: normal Python string (which will be converted to a `NavigableString`). _InsertableElement: TypeAlias = Union["PageElement", str] # Aliases to represent the many possibilities for matching bits of a # parse tree. # # This is very complicated because we're applying a formal type system # to some very DWIM code. 
The types we end up with will be the types # of the arguments to the SoupStrainer constructor and (more # familiarly to Beautiful Soup users) the find* methods. #: A function that takes a PageElement and returns a yes-or-no answer. _PageElementMatchFunction: TypeAlias = Callable[["PageElement"], bool] #: A function that takes the raw parsed ingredients of a markup tag #: and returns a yes-or-no answer. # Not necessary at the moment. # _AllowTagCreationFunction:TypeAlias = Callable[[Optional[str], str, Optional[_RawAttributeValues]], bool] #: A function that takes the raw parsed ingredients of a markup string node #: and returns a yes-or-no answer. # Not necessary at the moment. # _AllowStringCreationFunction:TypeAlias = Callable[[Optional[str]], bool] #: A function that takes a `Tag` and returns a yes-or-no answer. #: A `TagNameMatchRule` expects this kind of function, if you're #: going to pass it a function. _TagMatchFunction: TypeAlias = Callable[["Tag"], bool] #: A function that takes a single string and returns a yes-or-no #: answer. An `AttributeValueMatchRule` expects this kind of function, if #: you're going to pass it a function. So does a `StringMatchRule`. _StringMatchFunction: TypeAlias = Callable[[str], bool] #: Either a tag name, an attribute value or a string can be matched #: against a string, bytestring, regular expression, or a boolean. _BaseStrainable: TypeAlias = Union[str, bytes, Pattern[str], bool] #: A tag can be matched either with the `_BaseStrainable` options, or #: using a function that takes the `Tag` as its sole argument. _BaseStrainableElement: TypeAlias = Union[_BaseStrainable, _TagMatchFunction] #: A tag's attribute vgalue can be matched either with the #: `_BaseStrainable` options, or using a function that takes that #: value as its sole argument. _BaseStrainableAttribute: TypeAlias = Union[_BaseStrainable, _StringMatchFunction] #: A tag can be matched using either a single criterion or a list of #: criteria. 
_StrainableElement: TypeAlias = Union[
    _BaseStrainableElement, Iterable[_BaseStrainableElement]
]

#: An attribute value can be matched using either a single criterion
#: or a list of criteria.
_StrainableAttribute: TypeAlias = Union[
    _BaseStrainableAttribute, Iterable[_BaseStrainableAttribute]
]

#: A string can be matched using the same techniques as
#: an attribute value.
_StrainableString: TypeAlias = _StrainableAttribute

#: A dictionary may be used to match against multiple attribute values at once.
_StrainableAttributes: TypeAlias = Dict[str, _StrainableAttribute]

#: Many Beautiful Soup methods return a PageElement or a ResultSet of
#: PageElements. A PageElement is either a Tag or a NavigableString.
#: These convenience aliases make it easier for IDE users to see which methods
#: are available on the objects they're dealing with.
_OneElement: TypeAlias = Union["PageElement", "Tag", "NavigableString"]
_AtMostOneElement: TypeAlias = Optional[_OneElement]
_QueryResults: TypeAlias = "ResultSet[_OneElement]"