from __future__ import annotations
from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
from typing_extensions import TypeAlias
from bs4.dammit import EntitySubstitution
if TYPE_CHECKING:
from bs4._typing import _AttributeValue
class Formatter(EntitySubstitution):
"""Describes a strategy to use when outputting a parse tree to a string.
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
Formatters are passed in as the `formatter` argument to methods
like `bs4.element.Tag.encode`. Most people won't need to
think about formatters, and most people who need to think about
them can pass in one of these predefined strings as `formatter`
rather than making a new Formatter object:
For HTML documents:
* 'html' - HTML entity substitution for generic HTML documents. (default)
* 'html5' - HTML entity substitution for HTML5 documents, as
well as some optimizations in the way tags are rendered.
* 'html5-4.12.0' - The version of the 'html5' formatter used prior to
Beautiful Soup 4.13.0.
* 'minimal' - Only make the substitutions necessary to guarantee
valid HTML.
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
For XML documents:
* 'html' - Entity substitution for XHTML documents.
* 'minimal' - Only make the substitutions necessary to guarantee
valid XML. (default)
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
"""
#: Constant name denoting HTML markup
HTML: str = "html"
#: Constant name denoting XML markup
XML: str = "xml"
#: Fallback values for the various constructor options, used when
#: the markup language is HTML.
HTML_DEFAULTS: Dict[str, Set[str]] = {
    "cdata_containing_tags": {"script", "style"},
}
language: Optional[str] #: :meta private:
entity_substitution: Optional[_EntitySubstitutionFunction] #: :meta private:
void_element_close_prefix: str #: :meta private:
cdata_containing_tags: Set[str] #: :meta private:
indent: str #: :meta private:
#: If this is set to true by the constructor, then attributes whose
#: values are set to the empty string will be treated as HTML
#: boolean attributes. (Attributes whose value is None are always
#: rendered this way.)
empty_attributes_are_booleans: bool
def _default(
self, language: str, value: Optional[Set[str]], kwarg: str
) -> Set[str]:
if value is not None:
return value
if language == self.XML:
# When XML is the markup language in use, all of the
# defaults are the empty list.
return set()
# Otherwise, it depends on what's in HTML_DEFAULTS.
return self.HTML_DEFAULTS[kwarg]
def __init__(
self,
language: Optional[str] = None,
entity_substitution: Optional[_EntitySubstitutionFunction] = None,
void_element_close_prefix: str = "/",
cdata_containing_tags: Optional[Set[str]] = None,
empty_attributes_are_booleans: bool = False,
indent: Union[int,str] = 1,
):
r"""Constructor.
:param language: This should be `Formatter.XML` if you are formatting
XML markup and `Formatter.HTML` if you are formatting HTML markup.
:param entity_substitution: A function to call to replace special
characters with XML/HTML entities. For examples, see
bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
:param void_element_close_prefix: By default, void elements
are represented as <tag/> (XML rules) rather than <tag>
(HTML rules). To get <tag>, pass in the empty string.
:param cdata_containing_tags: The set of tags that are defined
as containing CDATA in this dialect. For example, in HTML,