BeautifulSoup4 python 3
This commit is contained in:
+11
-11
@@ -45,7 +45,7 @@ from .element import (
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
syntax_error = 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
"""
|
||||
@@ -69,7 +69,7 @@ class BeautifulSoup(Tag):
|
||||
like HTML's <br> tag), call handle_starttag and then
|
||||
handle_endtag.
|
||||
"""
|
||||
ROOT_TAG_NAME = u'[document]'
|
||||
ROOT_TAG_NAME = '[document]'
|
||||
|
||||
# If the end-user gives no indication which tree builder they
|
||||
# want, look for one with these features.
|
||||
@@ -135,12 +135,12 @@ class BeautifulSoup(Tag):
|
||||
"fromEncoding", "from_encoding")
|
||||
|
||||
if len(kwargs) > 0:
|
||||
arg = kwargs.keys().pop()
|
||||
arg = list(kwargs.keys()).pop()
|
||||
raise TypeError(
|
||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||
|
||||
if builder is None:
|
||||
if isinstance(features, basestring):
|
||||
if isinstance(features, str):
|
||||
features = [features]
|
||||
if features is None or len(features) == 0:
|
||||
features = self.DEFAULT_BUILDER_FEATURES
|
||||
@@ -164,7 +164,7 @@ class BeautifulSoup(Tag):
|
||||
# involving passing non-markup to Beautiful Soup.
|
||||
# Beautiful Soup will still parse the input as markup,
|
||||
# just in case that's what the user really wants.
|
||||
if (isinstance(markup, unicode)
|
||||
if (isinstance(markup, str)
|
||||
and not os.path.supports_unicode_filenames):
|
||||
possible_filename = markup.encode("utf8")
|
||||
else:
|
||||
@@ -172,7 +172,7 @@ class BeautifulSoup(Tag):
|
||||
is_file = False
|
||||
try:
|
||||
is_file = os.path.exists(possible_filename)
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
# This is almost certainly a problem involving
|
||||
# characters not valid in filenames on this
|
||||
# system. Just let it go.
|
||||
@@ -184,7 +184,7 @@ class BeautifulSoup(Tag):
|
||||
# TODO: This is ugly but I couldn't get it to work in
|
||||
# Python 3 otherwise.
|
||||
if ((isinstance(markup, bytes) and not b' ' in markup)
|
||||
or (isinstance(markup, unicode) and not u' ' in markup)):
|
||||
or (isinstance(markup, str) and not ' ' in markup)):
|
||||
warnings.warn(
|
||||
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
||||
|
||||
@@ -259,7 +259,7 @@ class BeautifulSoup(Tag):
|
||||
|
||||
def endData(self, containerClass=NavigableString):
|
||||
if self.current_data:
|
||||
current_data = u''.join(self.current_data)
|
||||
current_data = ''.join(self.current_data)
|
||||
# If whitespace is not preserved, and this string contains
|
||||
# nothing but ASCII spaces, replace it with a single space
|
||||
# or newline.
|
||||
@@ -367,9 +367,9 @@ class BeautifulSoup(Tag):
|
||||
encoding_part = ''
|
||||
if eventual_encoding != None:
|
||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
||||
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
||||
else:
|
||||
prefix = u''
|
||||
prefix = ''
|
||||
if not pretty_print:
|
||||
indent_level = None
|
||||
else:
|
||||
@@ -403,4 +403,4 @@ class FeatureNotFound(ValueError):
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
soup = BeautifulSoup(sys.stdin)
|
||||
print soup.prettify()
|
||||
print(soup.prettify())
|
||||
|
||||
@@ -153,13 +153,13 @@ class TreeBuilder(object):
|
||||
universal = self.cdata_list_attributes.get('*', [])
|
||||
tag_specific = self.cdata_list_attributes.get(
|
||||
tag_name.lower(), None)
|
||||
for attr in attrs.keys():
|
||||
for attr in list(attrs.keys()):
|
||||
if attr in universal or (tag_specific and attr in tag_specific):
|
||||
# We have a "class"-type attribute whose string
|
||||
# value is a whitespace-separated list of
|
||||
# values. Split it into a list.
|
||||
value = attrs[attr]
|
||||
if isinstance(value, basestring):
|
||||
if isinstance(value, str):
|
||||
values = whitespace_re.split(value)
|
||||
else:
|
||||
# html5lib sometimes calls setAttributes twice
|
||||
|
||||
@@ -37,7 +37,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
doc = parser.parse(markup, encoding=self.user_specified_encoding)
|
||||
|
||||
# Set the character encoding detected by the tokenizer.
|
||||
if isinstance(markup, unicode):
|
||||
if isinstance(markup, str):
|
||||
# We need to special-case this because html5lib sets
|
||||
# charEncoding to UTF-8 if it gets Unicode input.
|
||||
doc.original_encoding = None
|
||||
@@ -51,7 +51,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<html><head></head><body>%s</body></html>' % fragment
|
||||
return '<html><head></head><body>%s</body></html>' % fragment
|
||||
|
||||
|
||||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||
@@ -124,7 +124,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
|
||||
def appendChild(self, node):
|
||||
string_child = child = None
|
||||
if isinstance(node, basestring):
|
||||
if isinstance(node, str):
|
||||
# Some other piece of code decided to pass in a string
|
||||
# instead of creating a TextElement object to contain the
|
||||
# string.
|
||||
@@ -139,7 +139,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
else:
|
||||
child = node.element
|
||||
|
||||
if not isinstance(child, basestring) and child.parent is not None:
|
||||
if not isinstance(child, str) and child.parent is not None:
|
||||
node.element.extract()
|
||||
|
||||
if (string_child and self.element.contents
|
||||
@@ -152,7 +152,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
old_element.replace_with(new_element)
|
||||
self.soup._most_recent_element = new_element
|
||||
else:
|
||||
if isinstance(node, basestring):
|
||||
if isinstance(node, str):
|
||||
# Create a brand new NavigableString from this string.
|
||||
child = self.soup.new_string(node)
|
||||
|
||||
@@ -183,7 +183,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
|
||||
self.soup.builder._replace_cdata_list_attribute_values(
|
||||
self.name, attributes)
|
||||
for name, value in attributes.items():
|
||||
for name, value in list(attributes.items()):
|
||||
self.element[name] = value
|
||||
|
||||
# The attributes may contain variables that need substitution.
|
||||
|
||||
@@ -4,7 +4,7 @@ __all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from HTMLParser import (
|
||||
from html.parser import (
|
||||
HTMLParser,
|
||||
HTMLParseError,
|
||||
)
|
||||
@@ -72,9 +72,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
real_name = int(name)
|
||||
|
||||
try:
|
||||
data = unichr(real_name)
|
||||
except (ValueError, OverflowError), e:
|
||||
data = u"\N{REPLACEMENT CHARACTER}"
|
||||
data = chr(real_name)
|
||||
except (ValueError, OverflowError) as e:
|
||||
data = "\N{REPLACEMENT CHARACTER}"
|
||||
|
||||
self.handle_data(data)
|
||||
|
||||
@@ -142,7 +142,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
declared within markup, whether any characters had to be
|
||||
replaced with REPLACEMENT CHARACTER).
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
if isinstance(markup, str):
|
||||
yield (markup, None, None, False)
|
||||
return
|
||||
|
||||
@@ -158,7 +158,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
except HTMLParseError, e:
|
||||
except HTMLParseError as e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
|
||||
+11
-11
@@ -4,7 +4,7 @@ __all__ = [
|
||||
]
|
||||
|
||||
from io import BytesIO
|
||||
from StringIO import StringIO
|
||||
from io import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
||||
@@ -78,12 +78,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
|
||||
Each 4-tuple represents a strategy for parsing the document.
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
if isinstance(markup, str):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, unicode):
|
||||
if isinstance(markup, str):
|
||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||
# tell lxml to parse it as UTF-8.
|
||||
yield (markup.encode("utf8"), "utf8",
|
||||
@@ -102,7 +102,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, bytes):
|
||||
markup = BytesIO(markup)
|
||||
elif isinstance(markup, unicode):
|
||||
elif isinstance(markup, str):
|
||||
markup = StringIO(markup)
|
||||
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
@@ -117,7 +117,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
if len(data) != 0:
|
||||
self.parser.feed(data)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
def close(self):
|
||||
@@ -135,12 +135,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
for prefix, namespace in nsmap.items():
|
||||
for prefix, namespace in list(nsmap.items()):
|
||||
attribute = NamespacedAttribute(
|
||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
@@ -149,7 +149,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn then into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
for attr, value in list(attrs.items()):
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
@@ -207,7 +207,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
@@ -224,10 +224,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
self.parser = self.parser_for(encoding)
|
||||
self.parser.feed(markup)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<html><body>%s</body></html>' % fragment
|
||||
return '<html><body>%s</body></html>' % fragment
|
||||
|
||||
+5
-5
@@ -8,7 +8,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
from htmlentitydefs import codepoint2name
|
||||
from html.entities import codepoint2name
|
||||
import re
|
||||
import logging
|
||||
import string
|
||||
@@ -56,7 +56,7 @@ class EntitySubstitution(object):
|
||||
reverse_lookup = {}
|
||||
characters_for_re = []
|
||||
for codepoint, name in list(codepoint2name.items()):
|
||||
character = unichr(codepoint)
|
||||
character = chr(codepoint)
|
||||
if codepoint != 34:
|
||||
# There's no point in turning the quotation mark into
|
||||
# ", unless it happens within an attribute value, which
|
||||
@@ -340,9 +340,9 @@ class UnicodeDammit:
|
||||
self.detector = EncodingDetector(markup, override_encodings, is_html)
|
||||
|
||||
# Short-circuit if the data is in Unicode to begin with.
|
||||
if isinstance(markup, unicode) or markup == '':
|
||||
if isinstance(markup, str) or markup == '':
|
||||
self.markup = markup
|
||||
self.unicode_markup = unicode(markup)
|
||||
self.unicode_markup = str(markup)
|
||||
self.original_encoding = None
|
||||
return
|
||||
|
||||
@@ -425,7 +425,7 @@ class UnicodeDammit:
|
||||
def _to_unicode(self, data, encoding, errors="strict"):
|
||||
'''Given a string and its encoding, decodes the string into Unicode.
|
||||
%encoding is a string recognized by encodings.aliases'''
|
||||
return unicode(data, encoding, errors)
|
||||
return str(data, encoding, errors)
|
||||
|
||||
@property
|
||||
def declared_html_encoding(self):
|
||||
|
||||
+26
-26
@@ -1,7 +1,7 @@
|
||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
import cProfile
|
||||
from StringIO import StringIO
|
||||
from HTMLParser import HTMLParser
|
||||
from io import StringIO
|
||||
from html.parser import HTMLParser
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup, __version__
|
||||
from bs4.builder import builder_registry
|
||||
@@ -17,8 +17,8 @@ import cProfile
|
||||
|
||||
def diagnose(data):
|
||||
"""Diagnostic suite for isolating common problems."""
|
||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
||||
print "Python version %s" % sys.version
|
||||
print("Diagnostic running on Beautiful Soup %s" % __version__)
|
||||
print("Python version %s" % sys.version)
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
@@ -27,44 +27,44 @@ def diagnose(data):
|
||||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print (
|
||||
print((
|
||||
"I noticed that %s is not installed. Installing it may help." %
|
||||
name)
|
||||
name))
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append(["lxml", "xml"])
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
import html5lib
|
||||
print "Found html5lib version %s" % html5lib.__version__
|
||||
print("Found html5lib version %s" % html5lib.__version__)
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif os.path.exists(data):
|
||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
||||
print('"%s" looks like a filename. Reading data from the file.' % data)
|
||||
data = open(data).read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
||||
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
|
||||
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||
return
|
||||
print
|
||||
print()
|
||||
|
||||
for parser in basic_parsers:
|
||||
print "Trying to parse your markup with %s" % parser
|
||||
print("Trying to parse your markup with %s" % parser)
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, parser)
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
except Exception as e:
|
||||
print("%s could not parse the markup." % parser)
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "Here's what %s did with the markup:" % parser
|
||||
print soup.prettify()
|
||||
print("Here's what %s did with the markup:" % parser)
|
||||
print(soup.prettify())
|
||||
|
||||
print "-" * 80
|
||||
print("-" * 80)
|
||||
|
||||
def lxml_trace(data, html=True, **kwargs):
|
||||
"""Print out the lxml events that occur during parsing.
|
||||
@@ -74,7 +74,7 @@ def lxml_trace(data, html=True, **kwargs):
|
||||
"""
|
||||
from lxml import etree
|
||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||
print("%s, %4s, %s" % (event, element.tag, element.text))
|
||||
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
|
||||
"""Announces HTMLParser parse events, without doing anything else."""
|
||||
@@ -156,9 +156,9 @@ def rdoc(num_elements=1000):
|
||||
|
||||
def benchmark_parsers(num_elements=100000):
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
||||
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
||||
data = rdoc(num_elements)
|
||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
||||
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
@@ -167,24 +167,24 @@ def benchmark_parsers(num_elements=100000):
|
||||
soup = BeautifulSoup(data, parser)
|
||||
b = time.time()
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
except Exception as e:
|
||||
print("%s could not parse the markup." % parser)
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
||||
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
||||
|
||||
from lxml import etree
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
||||
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
||||
|
||||
import html5lib
|
||||
parser = html5lib.HTMLParser()
|
||||
a = time.time()
|
||||
parser.parse(data)
|
||||
b = time.time()
|
||||
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
||||
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
||||
|
||||
def profile(num_elements=100000, parser="lxml"):
|
||||
|
||||
|
||||
+55
-55
@@ -21,22 +21,22 @@ def _alias(attr):
|
||||
return alias
|
||||
|
||||
|
||||
class NamespacedAttribute(unicode):
|
||||
class NamespacedAttribute(str):
|
||||
|
||||
def __new__(cls, prefix, name, namespace=None):
|
||||
if name is None:
|
||||
obj = unicode.__new__(cls, prefix)
|
||||
obj = str.__new__(cls, prefix)
|
||||
elif prefix is None:
|
||||
# Not really namespaced.
|
||||
obj = unicode.__new__(cls, name)
|
||||
obj = str.__new__(cls, name)
|
||||
else:
|
||||
obj = unicode.__new__(cls, prefix + ":" + name)
|
||||
obj = str.__new__(cls, prefix + ":" + name)
|
||||
obj.prefix = prefix
|
||||
obj.name = name
|
||||
obj.namespace = namespace
|
||||
return obj
|
||||
|
||||
class AttributeValueWithCharsetSubstitution(unicode):
|
||||
class AttributeValueWithCharsetSubstitution(str):
|
||||
"""A stand-in object for a character encoding specified in HTML."""
|
||||
|
||||
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
|
||||
@@ -47,7 +47,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
|
||||
"""
|
||||
|
||||
def __new__(cls, original_value):
|
||||
obj = unicode.__new__(cls, original_value)
|
||||
obj = str.__new__(cls, original_value)
|
||||
obj.original_value = original_value
|
||||
return obj
|
||||
|
||||
@@ -70,9 +70,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
|
||||
match = cls.CHARSET_RE.search(original_value)
|
||||
if match is None:
|
||||
# No substitution necessary.
|
||||
return unicode.__new__(unicode, original_value)
|
||||
return str.__new__(str, original_value)
|
||||
|
||||
obj = unicode.__new__(cls, original_value)
|
||||
obj = str.__new__(cls, original_value)
|
||||
obj.original_value = original_value
|
||||
return obj
|
||||
|
||||
@@ -152,7 +152,7 @@ class PageElement(object):
|
||||
|
||||
def format_string(self, s, formatter='minimal'):
|
||||
"""Format the given string using the given formatter."""
|
||||
if not callable(formatter):
|
||||
if not isinstance(formatter, collections.Callable):
|
||||
formatter = self._formatter_for_name(formatter)
|
||||
if formatter is None:
|
||||
output = s
|
||||
@@ -272,7 +272,7 @@ class PageElement(object):
|
||||
def insert(self, position, new_child):
|
||||
if new_child is self:
|
||||
raise ValueError("Cannot insert a tag into itself.")
|
||||
if (isinstance(new_child, basestring)
|
||||
if (isinstance(new_child, str)
|
||||
and not isinstance(new_child, NavigableString)):
|
||||
new_child = NavigableString(new_child)
|
||||
|
||||
@@ -489,7 +489,7 @@ class PageElement(object):
|
||||
result = (element for element in generator
|
||||
if isinstance(element, Tag))
|
||||
return ResultSet(strainer, result)
|
||||
elif isinstance(name, basestring):
|
||||
elif isinstance(name, str):
|
||||
# Optimization to find all tags with a given name.
|
||||
result = (element for element in generator
|
||||
if isinstance(element, Tag)
|
||||
@@ -640,7 +640,7 @@ class PageElement(object):
|
||||
return self.parents
|
||||
|
||||
|
||||
class NavigableString(unicode, PageElement):
|
||||
class NavigableString(str, PageElement):
|
||||
|
||||
PREFIX = ''
|
||||
SUFFIX = ''
|
||||
@@ -653,15 +653,15 @@ class NavigableString(unicode, PageElement):
|
||||
passed in to the superclass's __new__ or the superclass won't know
|
||||
how to handle non-ASCII characters.
|
||||
"""
|
||||
if isinstance(value, unicode):
|
||||
return unicode.__new__(cls, value)
|
||||
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||
if isinstance(value, str):
|
||||
return str.__new__(cls, value)
|
||||
return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||
|
||||
def __copy__(self):
|
||||
return self
|
||||
|
||||
def __getnewargs__(self):
|
||||
return (unicode(self),)
|
||||
return (str(self),)
|
||||
|
||||
def __getattr__(self, attr):
|
||||
"""text.string gives you text. This is for backwards
|
||||
@@ -701,23 +701,23 @@ class PreformattedString(NavigableString):
|
||||
|
||||
class CData(PreformattedString):
|
||||
|
||||
PREFIX = u'<![CDATA['
|
||||
SUFFIX = u']]>'
|
||||
PREFIX = '<![CDATA['
|
||||
SUFFIX = ']]>'
|
||||
|
||||
class ProcessingInstruction(PreformattedString):
|
||||
|
||||
PREFIX = u'<?'
|
||||
SUFFIX = u'?>'
|
||||
PREFIX = '<?'
|
||||
SUFFIX = '?>'
|
||||
|
||||
class Comment(PreformattedString):
|
||||
|
||||
PREFIX = u'<!--'
|
||||
SUFFIX = u'-->'
|
||||
PREFIX = '<!--'
|
||||
SUFFIX = '-->'
|
||||
|
||||
|
||||
class Declaration(PreformattedString):
|
||||
PREFIX = u'<!'
|
||||
SUFFIX = u'!>'
|
||||
PREFIX = '<!'
|
||||
SUFFIX = '!>'
|
||||
|
||||
|
||||
class Doctype(PreformattedString):
|
||||
@@ -734,8 +734,8 @@ class Doctype(PreformattedString):
|
||||
|
||||
return Doctype(value)
|
||||
|
||||
PREFIX = u'<!DOCTYPE '
|
||||
SUFFIX = u'>\n'
|
||||
PREFIX = '<!DOCTYPE '
|
||||
SUFFIX = '>\n'
|
||||
|
||||
|
||||
class Tag(PageElement):
|
||||
@@ -843,7 +843,7 @@ class Tag(PageElement):
|
||||
for string in self._all_strings(True):
|
||||
yield string
|
||||
|
||||
def get_text(self, separator=u"", strip=False,
|
||||
def get_text(self, separator="", strip=False,
|
||||
types=(NavigableString, CData)):
|
||||
"""
|
||||
Get all child strings, concatenated using the given separator.
|
||||
@@ -915,7 +915,7 @@ class Tag(PageElement):
|
||||
def __contains__(self, x):
|
||||
return x in self.contents
|
||||
|
||||
def __nonzero__(self):
|
||||
def __bool__(self):
|
||||
"A tag is non-None even if it has no contents."
|
||||
return True
|
||||
|
||||
@@ -1014,7 +1014,7 @@ class Tag(PageElement):
|
||||
|
||||
# First off, turn a string formatter into a function. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
if not callable(formatter):
|
||||
if not isinstance(formatter, collections.Callable):
|
||||
formatter = self._formatter_for_name(formatter)
|
||||
|
||||
attrs = []
|
||||
@@ -1025,8 +1025,8 @@ class Tag(PageElement):
|
||||
else:
|
||||
if isinstance(val, list) or isinstance(val, tuple):
|
||||
val = ' '.join(val)
|
||||
elif not isinstance(val, basestring):
|
||||
val = unicode(val)
|
||||
elif not isinstance(val, str):
|
||||
val = str(val)
|
||||
elif (
|
||||
isinstance(val, AttributeValueWithCharsetSubstitution)
|
||||
and eventual_encoding is not None):
|
||||
@@ -1034,7 +1034,7 @@ class Tag(PageElement):
|
||||
|
||||
text = self.format_string(val, formatter)
|
||||
decoded = (
|
||||
unicode(key) + '='
|
||||
str(key) + '='
|
||||
+ EntitySubstitution.quoted_attribute_value(text))
|
||||
attrs.append(decoded)
|
||||
close = ''
|
||||
@@ -1112,7 +1112,7 @@ class Tag(PageElement):
|
||||
"""
|
||||
# First off, turn a string formatter into a function. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
if not callable(formatter):
|
||||
if not isinstance(formatter, collections.Callable):
|
||||
formatter = self._formatter_for_name(formatter)
|
||||
|
||||
pretty_print = (indent_level is not None)
|
||||
@@ -1210,16 +1210,16 @@ class Tag(PageElement):
|
||||
raise ValueError(
|
||||
'Final combinator "%s" is missing an argument.' % tokens[-1])
|
||||
if self._select_debug:
|
||||
print 'Running CSS selector "%s"' % selector
|
||||
print('Running CSS selector "%s"' % selector)
|
||||
for index, token in enumerate(tokens):
|
||||
if self._select_debug:
|
||||
print ' Considering token "%s"' % token
|
||||
print(' Considering token "%s"' % token)
|
||||
recursive_candidate_generator = None
|
||||
tag_name = None
|
||||
if tokens[index-1] in self._selector_combinators:
|
||||
# This token was consumed by the previous combinator. Skip it.
|
||||
if self._select_debug:
|
||||
print ' Token was consumed by the previous combinator.'
|
||||
print(' Token was consumed by the previous combinator.')
|
||||
continue
|
||||
# Each operation corresponds to a checker function, a rule
|
||||
# for determining whether a candidate matches the
|
||||
@@ -1325,14 +1325,14 @@ class Tag(PageElement):
|
||||
next_token = tokens[index+1]
|
||||
def recursive_select(tag):
|
||||
if self._select_debug:
|
||||
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
||||
print '-' * 40
|
||||
print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
|
||||
print('-' * 40)
|
||||
for i in tag.select(next_token, recursive_candidate_generator):
|
||||
if self._select_debug:
|
||||
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
||||
print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
|
||||
yield i
|
||||
if self._select_debug:
|
||||
print '-' * 40
|
||||
print('-' * 40)
|
||||
_use_candidate_generator = recursive_select
|
||||
elif _candidate_generator is None:
|
||||
# By default, a tag's candidates are all of its
|
||||
@@ -1343,7 +1343,7 @@ class Tag(PageElement):
|
||||
check = "[any]"
|
||||
else:
|
||||
check = tag_name
|
||||
print ' Default candidate generator, tag name="%s"' % check
|
||||
print(' Default candidate generator, tag name="%s"' % check)
|
||||
if self._select_debug:
|
||||
# This is redundant with later code, but it stops
|
||||
# a bunch of bogus tags from cluttering up the
|
||||
@@ -1365,8 +1365,8 @@ class Tag(PageElement):
|
||||
new_context_ids = set([])
|
||||
for tag in current_context:
|
||||
if self._select_debug:
|
||||
print " Running candidate generator on %s %s" % (
|
||||
tag.name, repr(tag.attrs))
|
||||
print(" Running candidate generator on %s %s" % (
|
||||
tag.name, repr(tag.attrs)))
|
||||
for candidate in _use_candidate_generator(tag):
|
||||
if not isinstance(candidate, Tag):
|
||||
continue
|
||||
@@ -1381,21 +1381,21 @@ class Tag(PageElement):
|
||||
break
|
||||
if checker is None or result:
|
||||
if self._select_debug:
|
||||
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
|
||||
if id(candidate) not in new_context_ids:
|
||||
# If a tag matches a selector more than once,
|
||||
# don't include it in the context more than once.
|
||||
new_context.append(candidate)
|
||||
new_context_ids.add(id(candidate))
|
||||
elif self._select_debug:
|
||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
|
||||
|
||||
current_context = new_context
|
||||
|
||||
if self._select_debug:
|
||||
print "Final verdict:"
|
||||
print("Final verdict:")
|
||||
for i in current_context:
|
||||
print " %s %s" % (i.name, i.attrs)
|
||||
print(" %s %s" % (i.name, i.attrs))
|
||||
return current_context
|
||||
|
||||
# Old names for backwards compatibility
|
||||
@@ -1439,7 +1439,7 @@ class SoupStrainer(object):
|
||||
else:
|
||||
attrs = kwargs
|
||||
normalized_attrs = {}
|
||||
for key, value in attrs.items():
|
||||
for key, value in list(attrs.items()):
|
||||
normalized_attrs[key] = self._normalize_search_value(value)
|
||||
|
||||
self.attrs = normalized_attrs
|
||||
@@ -1448,7 +1448,7 @@ class SoupStrainer(object):
|
||||
def _normalize_search_value(self, value):
|
||||
# Leave it alone if it's a Unicode string, a callable, a
|
||||
# regular expression, a boolean, or None.
|
||||
if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
|
||||
if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match')
|
||||
or isinstance(value, bool) or value is None):
|
||||
return value
|
||||
|
||||
@@ -1461,7 +1461,7 @@ class SoupStrainer(object):
|
||||
new_value = []
|
||||
for v in value:
|
||||
if (hasattr(v, '__iter__') and not isinstance(v, bytes)
|
||||
and not isinstance(v, unicode)):
|
||||
and not isinstance(v, str)):
|
||||
# This is almost certainly the user's mistake. In the
|
||||
# interests of avoiding infinite loops, we'll let
|
||||
# it through as-is rather than doing a recursive call.
|
||||
@@ -1473,7 +1473,7 @@ class SoupStrainer(object):
|
||||
# Otherwise, convert it into a Unicode string.
|
||||
# The unicode(str()) thing is so this will do the same thing on Python 2
|
||||
# and Python 3.
|
||||
return unicode(str(value))
|
||||
return str(str(value))
|
||||
|
||||
def __str__(self):
|
||||
if self.text:
|
||||
@@ -1527,7 +1527,7 @@ class SoupStrainer(object):
|
||||
found = None
|
||||
# If given a list of items, scan it for a text element that
|
||||
# matches.
|
||||
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
|
||||
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
|
||||
for element in markup:
|
||||
if isinstance(element, NavigableString) \
|
||||
and self.search(element):
|
||||
@@ -1540,7 +1540,7 @@ class SoupStrainer(object):
|
||||
found = self.search_tag(markup)
|
||||
# If it's text, make sure the text matches.
|
||||
elif isinstance(markup, NavigableString) or \
|
||||
isinstance(markup, basestring):
|
||||
isinstance(markup, str):
|
||||
if not self.name and not self.attrs and self._matches(markup, self.text):
|
||||
found = markup
|
||||
else:
|
||||
@@ -1554,7 +1554,7 @@ class SoupStrainer(object):
|
||||
if isinstance(markup, list) or isinstance(markup, tuple):
|
||||
# This should only happen when searching a multi-valued attribute
|
||||
# like 'class'.
|
||||
if (isinstance(match_against, unicode)
|
||||
if (isinstance(match_against, str)
|
||||
and ' ' in match_against):
|
||||
# A bit of a special case. If they try to match "foo
|
||||
# bar" on a multivalue attribute's value, only accept
|
||||
@@ -1589,7 +1589,7 @@ class SoupStrainer(object):
|
||||
# None matches None, False, an empty string, an empty list, and so on.
|
||||
return not match_against
|
||||
|
||||
if isinstance(match_against, unicode):
|
||||
if isinstance(match_against, str):
|
||||
# Exact string match
|
||||
return markup == match_against
|
||||
|
||||
|
||||
+16
-16
@@ -225,14 +225,14 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||
|
||||
def test_entities_in_attributes_converted_to_unicode(self):
|
||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
|
||||
def test_entities_in_text_converted_to_unicode(self):
|
||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
@@ -243,7 +243,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
'<p>I said "good day!"</p>')
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
||||
expect = "\N{REPLACEMENT CHARACTER}"
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
@@ -285,9 +285,9 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
# A seemingly innocuous document... but it's in Unicode! And
|
||||
# it contains characters that can't be represented in the
|
||||
# encoding found in the declaration! The horror!
|
||||
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||
markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
|
||||
self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
|
||||
|
||||
def test_soupstrainer(self):
|
||||
"""Parsers should be able to work with SoupStrainers."""
|
||||
@@ -327,7 +327,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
# Both XML and HTML entities are converted to Unicode characters
|
||||
# during parsing.
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||
self.assertSoupEquals(text, expected)
|
||||
|
||||
def test_smart_quotes_converted_on_the_way_in(self):
|
||||
@@ -337,15 +337,15 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
soup = self.soup(quote)
|
||||
self.assertEqual(
|
||||
soup.p.string,
|
||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||
"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||
|
||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||
soup = self.soup("<a> </a>")
|
||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
||||
self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
|
||||
|
||||
def test_entities_converted_on_the_way_out(self):
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||
soup = self.soup(text)
|
||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||
|
||||
@@ -354,7 +354,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
# easy-to-understand document.
|
||||
|
||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||
|
||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||
# that to test.
|
||||
@@ -493,15 +493,15 @@ class XMLTreeBuilderSmokeTest(object):
|
||||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_can_parse_unicode_document(self):
|
||||
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||
markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
|
||||
self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
|
||||
|
||||
def test_popping_namespaced_tag(self):
|
||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
unicode(soup.rss), markup)
|
||||
str(soup.rss), markup)
|
||||
|
||||
def test_docstring_includes_correct_encoding(self):
|
||||
soup = self.soup("<root/>")
|
||||
@@ -532,17 +532,17 @@ class XMLTreeBuilderSmokeTest(object):
|
||||
def test_closing_namespaced_tag(self):
|
||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.p), markup)
|
||||
self.assertEqual(str(soup.p), markup)
|
||||
|
||||
def test_namespaced_attributes(self):
|
||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
self.assertEqual(str(soup.foo), markup)
|
||||
|
||||
def test_namespaced_attributes_xml_namespace(self):
|
||||
markup = '<foo xml:lang="fr">bar</foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
self.assertEqual(str(soup.foo), markup)
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||
"""Smoke test for a tree builder that supports HTML5."""
|
||||
|
||||
Reference in New Issue
Block a user