BeautifulSoup4 python 3

2014-10-06 11:12:35 +02:00
parent 5b7e814166
commit 6b4e6857de
9 changed files with 138 additions and 138 deletions
@@ -45,7 +45,7 @@ from .element import (

 # The very first thing we do is give a useful error if someone is
 # running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+syntax_error = 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

 class BeautifulSoup(Tag):
    """
@@ -69,7 +69,7 @@ class BeautifulSoup(Tag):
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
-    ROOT_TAG_NAME = u'[document]'
+    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
@@ -135,12 +135,12 @@ class BeautifulSoup(Tag):
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
+            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
-            if isinstance(features, basestring):
+            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
@@ -164,7 +164,7 @@ class BeautifulSoup(Tag):
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
-            if (isinstance(markup, unicode)
+            if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
@@ -172,7 +172,7 @@ class BeautifulSoup(Tag):
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
-            except Exception, e:
+            except Exception as e:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
@@ -184,7 +184,7 @@ class BeautifulSoup(Tag):
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and not b' ' in markup)
-                    or (isinstance(markup, unicode) and not u' ' in markup)):
+                    or (isinstance(markup, str) and not ' ' in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

@@ -259,7 +259,7 @@ class BeautifulSoup(Tag):

    def endData(self, containerClass=NavigableString):
        if self.current_data:
-            current_data = u''.join(self.current_data)
+            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
@@ -367,9 +367,9 @@ class BeautifulSoup(Tag):
            encoding_part = ''
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
-            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
-            prefix = u''
+            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
@@ -403,4 +403,4 @@ class FeatureNotFound(ValueError):
 if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
-    print soup.prettify()
+    print(soup.prettify())
@@ -153,13 +153,13 @@ class TreeBuilder(object):
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
-            for attr in attrs.keys():
+            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
-                    if isinstance(value, basestring):
+                    if isinstance(value, str):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
@@ -37,7 +37,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
@@ -51,7 +51,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
+        return '<html><head></head><body>%s</body></html>' % fragment


 class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
@@ -124,7 +124,7 @@ class Element(html5lib.treebuilders._base.Node):

    def appendChild(self, node):
        string_child = child = None
-        if isinstance(node, basestring):
+        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
@@ -139,7 +139,7 @@ class Element(html5lib.treebuilders._base.Node):
        else:
            child = node.element

-        if not isinstance(child, basestring) and child.parent is not None:
+        if not isinstance(child, str) and child.parent is not None:
            node.element.extract()

        if (string_child and self.element.contents
@@ -152,7 +152,7 @@ class Element(html5lib.treebuilders._base.Node):
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
-            if isinstance(node, basestring):
+            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

@@ -183,7 +183,7 @@ class Element(html5lib.treebuilders._base.Node):

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
-            for name, value in attributes.items():
+            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
@@ -4,7 +4,7 @@ __all__ = [
    'HTMLParserTreeBuilder',
    ]

-from HTMLParser import (
+from html.parser import (
    HTMLParser,
    HTMLParseError,
    )
@@ -72,9 +72,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
            real_name = int(name)

        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
+            data = chr(real_name)
+        except (ValueError, OverflowError) as e:
+            data = "\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

@@ -142,7 +142,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            yield (markup, None, None, False)
            return

@@ -158,7 +158,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        parser.soup = self.soup
        try:
            parser.feed(markup)
-        except HTMLParseError, e:
+        except HTMLParseError as e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
@@ -4,7 +4,7 @@ __all__ = [
    ]

 from io import BytesIO
-from StringIO import StringIO
+from io import StringIO
 import collections
 from lxml import etree
 from bs4.element import Comment, Doctype, NamespacedAttribute
@@ -78,12 +78,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):

        Each 4-tuple represents a strategy for parsing the document.
        """
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
@@ -102,7 +102,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    def feed(self, markup):
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
@@ -117,7 +117,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def close(self):
@@ -135,12 +135,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+            inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
-            for prefix, namespace in nsmap.items():
+            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace
@@ -149,7 +149,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
-        for attr, value in attrs.items():
+        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
@@ -207,7 +207,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
-        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -224,10 +224,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))


    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
+        return '<html><body>%s</body></html>' % fragment
@@ -8,7 +8,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
 """

 import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
 import re
 import logging
 import string
@@ -56,7 +56,7 @@ class EntitySubstitution(object):
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
-            character = unichr(codepoint)
+            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
@@ -340,9 +340,9 @@ class UnicodeDammit:
        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with.
-        if isinstance(markup, unicode) or markup == '':
+        if isinstance(markup, str) or markup == '':
            self.markup = markup
-            self.unicode_markup = unicode(markup)
+            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

@@ -425,7 +425,7 @@ class UnicodeDammit:
    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
-        return unicode(data, encoding, errors)
+        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
@@ -1,7 +1,7 @@
 """Diagnostic functions, mainly for use when doing tech support."""
 import cProfile
-from StringIO import StringIO
-from HTMLParser import HTMLParser
+from io import StringIO
+from html.parser import HTMLParser
 import bs4
 from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry
@@ -17,8 +17,8 @@ import cProfile

 def diagnose(data):
    """Diagnostic suite for isolating common problems."""
-    print "Diagnostic running on Beautiful Soup %s" % __version__
-    print "Python version %s" % sys.version
+    print("Diagnostic running on Beautiful Soup %s" % __version__)
+    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in basic_parsers:
@@ -27,44 +27,44 @@ def diagnose(data):
                break
        else:
            basic_parsers.remove(name)
-            print (
+            print((
                "I noticed that %s is not installed. Installing it may help." %
-                name)
+                name))

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
-        print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+        print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
-        print "Found html5lib version %s" % html5lib.__version__
+        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
-        print '"%s" looks like a filename. Reading data from the file.' % data
+        print('"%s" looks like a filename. Reading data from the file.' % data)
        data = open(data).read()
    elif data.startswith("http:") or data.startswith("https:"):
-        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
-        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
-    print
+    print()

    for parser in basic_parsers:
-        print "Trying to parse your markup with %s" % parser
+        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
-        except Exception, e:
-            print "%s could not parse the markup." % parser
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
-            print "Here's what %s did with the markup:" % parser
-            print soup.prettify()
+            print("Here's what %s did with the markup:" % parser)
+            print(soup.prettify())

-        print "-" * 80
+        print("-" * 80)

 def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.
@@ -74,7 +74,7 @@ def lxml_trace(data, html=True, **kwargs):
    """
    from lxml import etree
    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
-        print("%s, %4s, %s" % (event, element.tag, element.text))
+        print(("%s, %4s, %s" % (event, element.tag, element.text)))

 class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""
@@ -156,9 +156,9 @@ def rdoc(num_elements=1000):

 def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
-    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
-    print "Generated a large invalid HTML document (%d bytes)." % len(data)
+    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
@@ -167,24 +167,24 @@ def benchmark_parsers(num_elements=100000):
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
-        except Exception, e:
-            print "%s could not parse the markup." % parser
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
-            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
-    print "Raw lxml parsed the markup in %.2fs." % (b-a)
+    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
-    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+    print("Raw html5lib parsed the markup in %.2fs." % (b-a))

 def profile(num_elements=100000, parser="lxml"):

@@ -21,22 +21,22 @@ def _alias(attr):
    return alias


-class NamespacedAttribute(unicode):
+class NamespacedAttribute(str):

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
-            obj = unicode.__new__(cls, prefix)
+            obj = str.__new__(cls, prefix)
        elif prefix is None:
            # Not really namespaced.
-            obj = unicode.__new__(cls, name)
+            obj = str.__new__(cls, name)
        else:
-            obj = unicode.__new__(cls, prefix + ":" + name)
+            obj = str.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj

-class AttributeValueWithCharsetSubstitution(unicode):
+class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML."""

 class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -47,7 +47,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """

    def __new__(cls, original_value):
-        obj = unicode.__new__(cls, original_value)
+        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

@@ -70,9 +70,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
-            return unicode.__new__(unicode, original_value)
+            return str.__new__(str, original_value)

-        obj = unicode.__new__(cls, original_value)
+        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

@@ -152,7 +152,7 @@ class PageElement(object):

    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter."""
-        if not callable(formatter):
+        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)
        if formatter is None:
            output = s
@@ -272,7 +272,7 @@ class PageElement(object):
    def insert(self, position, new_child):
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
-        if (isinstance(new_child, basestring)
+        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

@@ -489,7 +489,7 @@ class PageElement(object):
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
-            elif isinstance(name, basestring):
+            elif isinstance(name, str):
                # Optimization to find all tags with a given name.
                result = (element for element in generator
                          if isinstance(element, Tag)
@@ -640,7 +640,7 @@ class PageElement(object):
        return self.parents


-class NavigableString(unicode, PageElement):
+class NavigableString(str, PageElement):

    PREFIX = ''
    SUFFIX = ''
@@ -653,15 +653,15 @@ class NavigableString(unicode, PageElement):
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
-        if isinstance(value, unicode):
-            return unicode.__new__(cls, value)
-        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+        if isinstance(value, str):
+            return str.__new__(cls, value)
+        return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __copy__(self):
        return self

    def __getnewargs__(self):
-        return (unicode(self),)
+        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
@@ -701,23 +701,23 @@ class PreformattedString(NavigableString):

 class CData(PreformattedString):

-    PREFIX = u'<![CDATA['
-    SUFFIX = u']]>'
+    PREFIX = '<![CDATA['
+    SUFFIX = ']]>'

 class ProcessingInstruction(PreformattedString):

-    PREFIX = u'<?'
-    SUFFIX = u'?>'
+    PREFIX = '<?'
+    SUFFIX = '?>'

 class Comment(PreformattedString):

-    PREFIX = u'<!--'
-    SUFFIX = u'-->'
+    PREFIX = '<!--'
+    SUFFIX = '-->'


 class Declaration(PreformattedString):
-    PREFIX = u'<!'
-    SUFFIX = u'!>'
+    PREFIX = '<!'
+    SUFFIX = '!>'


 class Doctype(PreformattedString):
@@ -734,8 +734,8 @@ class Doctype(PreformattedString):

        return Doctype(value)

-    PREFIX = u'<!DOCTYPE '
-    SUFFIX = u'>\n'
+    PREFIX = '<!DOCTYPE '
+    SUFFIX = '>\n'


 class Tag(PageElement):
@@ -843,7 +843,7 @@ class Tag(PageElement):
        for string in self._all_strings(True):
            yield string

-    def get_text(self, separator=u"", strip=False,
+    def get_text(self, separator="", strip=False,
                 types=(NavigableString, CData)):
        """
        Get all child strings, concatenated using the given separator.
@@ -915,7 +915,7 @@ class Tag(PageElement):
    def __contains__(self, x):
        return x in self.contents

-    def __nonzero__(self):
+    def __bool__(self):
        "A tag is non-None even if it has no contents."
        return True

@@ -1014,7 +1014,7 @@ class Tag(PageElement):

        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        attrs = []
@@ -1025,8 +1025,8 @@ class Tag(PageElement):
                else:
                    if isinstance(val, list) or isinstance(val, tuple):
                        val = ' '.join(val)
-                    elif not isinstance(val, basestring):
-                        val = unicode(val)
+                    elif not isinstance(val, str):
+                        val = str(val)
                    elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
                        and eventual_encoding is not None):
@@ -1034,7 +1034,7 @@ class Tag(PageElement):

                    text = self.format_string(val, formatter)
                    decoded = (
-                        unicode(key) + '='
+                        str(key) + '='
                        + EntitySubstitution.quoted_attribute_value(text))
                attrs.append(decoded)
        close = ''
@@ -1112,7 +1112,7 @@ class Tag(PageElement):
        """
        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        pretty_print = (indent_level is not None)
@@ -1210,16 +1210,16 @@ class Tag(PageElement):
            raise ValueError(
                'Final combinator "%s" is missing an argument.' % tokens[-1])
        if self._select_debug:
-            print 'Running CSS selector "%s"' % selector
+            print('Running CSS selector "%s"' % selector)
        for index, token in enumerate(tokens):
            if self._select_debug:
-                print ' Considering token "%s"' % token
+                print(' Considering token "%s"' % token)
            recursive_candidate_generator = None
            tag_name = None
            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
-                    print '  Token was consumed by the previous combinator.'
+                    print('  Token was consumed by the previous combinator.')
                continue
            # Each operation corresponds to a checker function, a rule
            # for determining whether a candidate matches the
@@ -1325,14 +1325,14 @@ class Tag(PageElement):
                next_token = tokens[index+1]
                def recursive_select(tag):
                    if self._select_debug:
-                        print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
-                        print '-' * 40
+                        print('    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
+                        print('-' * 40)
                    for i in tag.select(next_token, recursive_candidate_generator):
                        if self._select_debug:
-                            print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+                            print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                        yield i
                    if self._select_debug:
-                        print '-' * 40
+                        print('-' * 40)
                _use_candidate_generator = recursive_select
            elif _candidate_generator is None:
                # By default, a tag's candidates are all of its
@@ -1343,7 +1343,7 @@ class Tag(PageElement):
                        check = "[any]"
                    else:
                        check = tag_name
-                    print '   Default candidate generator, tag name="%s"' % check
+                    print('   Default candidate generator, tag name="%s"' % check)
                if self._select_debug:
                    # This is redundant with later code, but it stops
                    # a bunch of bogus tags from cluttering up the
@@ -1365,8 +1365,8 @@ class Tag(PageElement):
            new_context_ids = set([])
            for tag in current_context:
                if self._select_debug:
-                    print "    Running candidate generator on %s %s" % (
-                        tag.name, repr(tag.attrs))
+                    print("    Running candidate generator on %s %s" % (
+                        tag.name, repr(tag.attrs)))
                for candidate in _use_candidate_generator(tag):
                    if not isinstance(candidate, Tag):
                        continue
@@ -1381,21 +1381,21 @@ class Tag(PageElement):
                            break
                    if checker is None or result:
                        if self._select_debug:
-                            print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+                            print("     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                        if id(candidate) not in new_context_ids:
                            # If a tag matches a selector more than once,
                            # don't include it in the context more than once.
                            new_context.append(candidate)
                            new_context_ids.add(id(candidate))
                    elif self._select_debug:
-                        print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+                        print("     FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

            current_context = new_context

        if self._select_debug:
-            print "Final verdict:"
+            print("Final verdict:")
            for i in current_context:
-                print " %s %s" % (i.name, i.attrs)
+                print(" %s %s" % (i.name, i.attrs))
        return current_context

    # Old names for backwards compatibility
@@ -1439,7 +1439,7 @@ class SoupStrainer(object):
            else:
                attrs = kwargs
        normalized_attrs = {}
-        for key, value in attrs.items():
+        for key, value in list(attrs.items()):
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
@@ -1448,7 +1448,7 @@ class SoupStrainer(object):
    def _normalize_search_value(self, value):
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
-        if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
+        if (isinstance(value, str) or callable(value) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

@@ -1461,7 +1461,7 @@ class SoupStrainer(object):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
-                    and not isinstance(v, unicode)):
+                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
@@ -1473,7 +1473,7 @@ class SoupStrainer(object):
        # Otherwise, convert it into a Unicode string.
        # The unicode(str()) thing is so this will do the same thing on Python 2
        # and Python 3.
-        return unicode(str(value))
+        return str(str(value))

    def __str__(self):
        if self.text:
@@ -1527,7 +1527,7 @@ class SoupStrainer(object):
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
-        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
+        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
@@ -1540,7 +1540,7 @@ class SoupStrainer(object):
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
-                 isinstance(markup, basestring):
+                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
@@ -1554,7 +1554,7 @@ class SoupStrainer(object):
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
-            if (isinstance(match_against, unicode)
+            if (isinstance(match_against, str)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
@@ -1589,7 +1589,7 @@ class SoupStrainer(object):
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

-        if isinstance(match_against, unicode):
+        if isinstance(match_against, str):
            # Exact string match
            return markup == match_against

@@ -225,14 +225,14 @@ class HTMLTreeBuilderSmokeTest(object):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
-        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
-        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@@ -243,7 +243,7 @@ class HTMLTreeBuilderSmokeTest(object):
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
-        expect = u"\N{REPLACEMENT CHARACTER}"
+        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)
@@ -285,9 +285,9 @@ class HTMLTreeBuilderSmokeTest(object):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the <meta> declaration! The horror!
-        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
-        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
@@ -327,7 +327,7 @@ class HTMLTreeBuilderSmokeTest(object):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
-        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
@@ -337,15 +337,15 @@ class HTMLTreeBuilderSmokeTest(object):
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
-            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
-        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
-        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

@@ -354,7 +354,7 @@ class HTMLTreeBuilderSmokeTest(object):
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
-        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
@@ -493,15 +493,15 @@ class XMLTreeBuilderSmokeTest(object):
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)

    def test_can_parse_unicode_document(self):
-        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
-        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
-            unicode(soup.rss), markup)
+            str(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")
@@ -532,17 +532,17 @@ class XMLTreeBuilderSmokeTest(object):
    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
-        self.assertEqual(unicode(soup.p), markup)
+        self.assertEqual(str(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(str(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(str(soup.foo), markup)

 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""