Library update
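Switch the bundled HTML parsing library from BeautifulSoup 3 to BeautifulSoup 4 (bs4): provider imports move from the BeautifulSoup module to bs4, findAll() calls become find_all(), and the parseOnlyThese constructor argument becomes parse_only. A minimal sketch of the rename applied throughout this commit follows; the markup and variable names are illustrative only and are not taken from any provider:

    from bs4 import BeautifulSoup, SoupStrainer

    data = '<table><tr><td>example</td></tr></table>'  # stands in for fetched HTML
    tables = SoupStrainer('table')

    # BeautifulSoup 3 spelling was: BeautifulSoup(data, parseOnlyThese = tables) and html.findAll('tr')
    html = BeautifulSoup(data, parse_only = tables)  # bs4 keyword argument
    rows = html.find_all('tr')                       # bs4 method name

The vendored copy of Beautiful Soup 4.1.0 added under libs/bs4 below provides these APIs.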
@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
     simplifyString
@@ -49,21 +49,21 @@ class Mysterbin(NZBProvider):
         try:
             html = BeautifulSoup(data)
             resultable = html.find('table', attrs = {'class':'t'})
-            for result in resultable.findAll('tr'):
+            for result in resultable.find_all('tr'):

                 try:
                     myster_id = result.find('input', attrs = {'class': 'check4nzb'})['value']

                     # Age
                     age = ''
-                    for temp in result.find('td', attrs = {'class': 'cdetail'}).findAll(text = True):
+                    for temp in result.find('td', attrs = {'class': 'cdetail'}).find_all(text = True):
                         if 'days' in temp:
                             age = tryInt(temp.split(' ')[0])
                             break

                     # size
                     size = None
-                    for temp in result.find('div', attrs = {'class': 'cdetail'}).findAll(text = True):
+                    for temp in result.find('div', attrs = {'class': 'cdetail'}).find_all(text = True):
                         if 'gb' in temp.lower() or 'mb' in temp.lower() or 'kb' in temp.lower():
                             size = self.parseSize(temp)
                             break
@@ -74,7 +74,7 @@ class Mysterbin(NZBProvider):

                     new = {
                         'id': myster_id,
-                        'name': ''.join(result.find('span', attrs = {'class': 'cname'}).findAll(text = True)),
+                        'name': ''.join(result.find('span', attrs = {'class': 'cname'}).find_all(text = True)),
                         'type': 'nzb',
                         'provider': self.getName(),
                         'age': age,

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
     simplifyString

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
     simplifyString

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.variable import tryInt, getTitle
 from couchpotato.core.logger import CPLog
@@ -47,14 +47,14 @@ class KickAssTorrents(TorrentProvider):
         try:
             html = BeautifulSoup(data)
             resultdiv = html.find('div', attrs = {'class':'tabs'})
-            for result in resultdiv.findAll('div', recursive = False):
+            for result in resultdiv.find_all('div', recursive = False):
                 if result.get('id').lower() not in cat_ids:
                     continue

                 try:

                     try:
-                        for temp in result.findAll('tr'):
+                        for temp in result.find_all('tr'):
                             if temp['class'] is 'firstr' or not temp.get('id'):
                                 continue

@@ -68,15 +68,15 @@ class KickAssTorrents(TorrentProvider):
                             }

                             nr = 0
-                            for td in temp.findAll('td'):
+                            for td in temp.find_all('td'):
                                 column_name = table_order[nr]
                                 if column_name:

                                     if column_name is 'name':
-                                        link = td.find('div', {'class': 'torrentname'}).findAll('a')[1]
+                                        link = td.find('div', {'class': 'torrentname'}).find_all('a')[1]
                                         new['id'] = temp.get('id')[-8:]
                                         new['name'] = link.text
-                                        new['url'] = td.findAll('a', 'idownload')[1]['href']
+                                        new['url'] = td.find_all('a', 'idownload')[1]['href']
                                         if new['url'][:2] == '//':
                                             new['url'] = 'http:%s' % new['url']
                                         new['score'] = 20 if td.find('a', 'iverif') else 0

@@ -1,4 +1,4 @@
-from BeautifulSoup import SoupStrainer, BeautifulSoup
+from bs4 import SoupStrainer, BeautifulSoup
 from couchpotato.core.helpers.encoding import tryUrlencode
 from couchpotato.core.helpers.variable import mergeDicts, getTitle
 from couchpotato.core.logger import CPLog
@@ -51,13 +51,13 @@ class HDTrailers(TrailerProvider):

         try:
             tables = SoupStrainer('div')
-            html = BeautifulSoup(data, parseOnlyThese = tables)
-            result_table = html.findAll('h2', text = re.compile(movie_name))
+            html = BeautifulSoup(data, parse_only = tables)
+            result_table = html.find_all('h2', text = re.compile(movie_name))

             for h2 in result_table:
                 if 'trailer' in h2.lower():
                     parent = h2.parent.parent.parent
-                    trailerLinks = parent.findAll('a', text = re.compile('480p|720p|1080p'))
+                    trailerLinks = parent.find_all('a', text = re.compile('480p|720p|1080p'))
                     try:
                         for trailer in trailerLinks:
                             results[trailer].insert(0, trailer.parent['href'])
@@ -74,11 +74,11 @@ class HDTrailers(TrailerProvider):
         results = {'480p':[], '720p':[], '1080p':[]}
         try:
             tables = SoupStrainer('table')
-            html = BeautifulSoup(data, parseOnlyThese = tables)
+            html = BeautifulSoup(data, parse_only = tables)
             result_table = html.find('table', attrs = {'class':'bottomTable'})


-            for tr in result_table.findAll('tr'):
+            for tr in result_table.find_all('tr'):
                 trtext = str(tr).lower()
                 if 'clips' in trtext:
                     break
@@ -86,7 +86,7 @@ class HDTrailers(TrailerProvider):
                 nr = 0
                 if 'trailer' not in tr.find('span', 'standardTrailerName').text.lower():
                     continue
-                resolutions = tr.findAll('td', attrs = {'class':'bottomTableResolution'})
+                resolutions = tr.find_all('td', attrs = {'class':'bottomTableResolution'})
                 for res in resolutions:
                     results[str(res.a.contents[0])].insert(0, res.a['href'])
                     nr += 1

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.providers.userscript.base import UserscriptBase

 class AlloCine(UserscriptBase):

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.providers.userscript.base import UserscriptBase

File diff suppressed because it is too large

355  libs/bs4/__init__.py  Normal file
@@ -0,0 +1,355 @@
|
||||
"""Beautiful Soup
|
||||
Elixir and Tonic
|
||||
"The Screen-Scraper's Friend"
|
||||
http://www.crummy.com/software/BeautifulSoup/
|
||||
|
||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||
provides provides methods and Pythonic idioms that make it easy to
|
||||
navigate, search, and modify the parse tree.
|
||||
|
||||
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
||||
and/or html5lib is installed.
|
||||
|
||||
For more than you ever wanted to know about Beautiful Soup, see the
|
||||
documentation:
|
||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.1.0"
|
||||
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
from .builder import builder_registry
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
Comment,
|
||||
DEFAULT_OUTPUT_ENCODING,
|
||||
Declaration,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
PageElement,
|
||||
ProcessingInstruction,
|
||||
ResultSet,
|
||||
SoupStrainer,
|
||||
Tag,
|
||||
)
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
"""
|
||||
This class defines the basic interface called by the tree builders.
|
||||
|
||||
These methods will be called by the parser:
|
||||
reset()
|
||||
feed(markup)
|
||||
|
||||
The tree builder may call these methods from its feed() implementation:
|
||||
handle_starttag(name, attrs) # See note about return value
|
||||
handle_endtag(name)
|
||||
handle_data(data) # Appends to the current data node
|
||||
endData(containerClass=NavigableString) # Ends the current data node
|
||||
|
||||
No matter how complicated the underlying parser is, you should be
|
||||
able to build a tree using 'start tag' events, 'end tag' events,
|
||||
'data' events, and "done with data" events.
|
||||
|
||||
If you encounter an empty-element tag (aka a self-closing tag,
|
||||
like HTML's <br> tag), call handle_starttag and then
|
||||
handle_endtag.
|
||||
"""
|
||||
ROOT_TAG_NAME = u'[document]'
|
||||
|
||||
# If the end-user gives no indication which tree builder they
|
||||
# want, look for one with these features.
|
||||
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
||||
|
||||
# Used when determining whether a text node is all whitespace and
|
||||
# can be replaced with a single space. A text node that contains
|
||||
# fancy Unicode spaces (usually non-breaking) should be left
|
||||
# alone.
|
||||
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
|
||||
|
||||
def __init__(self, markup="", features=None, builder=None,
|
||||
parse_only=None, from_encoding=None, **kwargs):
|
||||
"""The Soup object is initialized as the 'root tag', and the
|
||||
provided markup (which can be a string or a file-like object)
|
||||
is fed into the underlying parser."""
|
||||
|
||||
if 'convertEntities' in kwargs:
|
||||
warnings.warn(
|
||||
"BS4 does not respect the convertEntities argument to the "
|
||||
"BeautifulSoup constructor. Entities are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'markupMassage' in kwargs:
|
||||
del kwargs['markupMassage']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the markupMassage argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for any necessary markup massage.")
|
||||
|
||||
if 'smartQuotesTo' in kwargs:
|
||||
del kwargs['smartQuotesTo']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the smartQuotesTo argument to the "
|
||||
"BeautifulSoup constructor. Smart quotes are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'selfClosingTags' in kwargs:
|
||||
del kwargs['selfClosingTags']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the selfClosingTags argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for understanding self-closing tags.")
|
||||
|
||||
if 'isHTML' in kwargs:
|
||||
del kwargs['isHTML']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the isHTML argument to the "
|
||||
"BeautifulSoup constructor. You can pass in features='html' "
|
||||
"or features='xml' to get a builder capable of handling "
|
||||
"one or the other.")
|
||||
|
||||
def deprecated_argument(old_name, new_name):
|
||||
if old_name in kwargs:
|
||||
warnings.warn(
|
||||
'The "%s" argument to the BeautifulSoup constructor '
|
||||
'has been renamed to "%s."' % (old_name, new_name))
|
||||
value = kwargs[old_name]
|
||||
del kwargs[old_name]
|
||||
return value
|
||||
return None
|
||||
|
||||
parse_only = parse_only or deprecated_argument(
|
||||
"parseOnlyThese", "parse_only")
|
||||
|
||||
from_encoding = from_encoding or deprecated_argument(
|
||||
"fromEncoding", "from_encoding")
|
||||
|
||||
if len(kwargs) > 0:
|
||||
arg = kwargs.keys().pop()
|
||||
raise TypeError(
|
||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||
|
||||
if builder is None:
|
||||
if isinstance(features, basestring):
|
||||
features = [features]
|
||||
if features is None or len(features) == 0:
|
||||
features = self.DEFAULT_BUILDER_FEATURES
|
||||
builder_class = builder_registry.lookup(*features)
|
||||
if builder_class is None:
|
||||
raise ValueError(
|
||||
"Couldn't find a tree builder with the features you "
|
||||
"requested: %s. Do you need to install a parser library?"
|
||||
% ",".join(features))
|
||||
builder = builder_class()
|
||||
self.builder = builder
|
||||
self.is_xml = builder.is_xml
|
||||
self.builder.soup = self
|
||||
|
||||
self.parse_only = parse_only
|
||||
|
||||
self.reset()
|
||||
|
||||
if hasattr(markup, 'read'): # It's a file-type object.
|
||||
markup = markup.read()
|
||||
(self.markup, self.original_encoding, self.declared_html_encoding,
|
||||
self.contains_replacement_characters) = (
|
||||
self.builder.prepare_markup(markup, from_encoding))
|
||||
|
||||
try:
|
||||
self._feed()
|
||||
except StopParsing:
|
||||
pass
|
||||
|
||||
# Clear out the markup and remove the builder's circular
|
||||
# reference to this object.
|
||||
self.markup = None
|
||||
self.builder.soup = None
|
||||
|
||||
def _feed(self):
|
||||
# Convert the document to Unicode.
|
||||
self.builder.reset()
|
||||
|
||||
self.builder.feed(self.markup)
|
||||
# Close out any unfinished strings and close all the open tags.
|
||||
self.endData()
|
||||
while self.currentTag.name != self.ROOT_TAG_NAME:
|
||||
self.popTag()
|
||||
|
||||
def reset(self):
|
||||
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
||||
self.hidden = 1
|
||||
self.builder.reset()
|
||||
self.currentData = []
|
||||
self.currentTag = None
|
||||
self.tagStack = []
|
||||
self.pushTag(self)
|
||||
|
||||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
||||
"""Create a new tag associated with this soup."""
|
||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
||||
|
||||
def new_string(self, s):
|
||||
"""Create a new NavigableString associated with this soup."""
|
||||
navigable = NavigableString(s)
|
||||
navigable.setup()
|
||||
return navigable
|
||||
|
||||
def insert_before(self, successor):
|
||||
raise ValueError("BeautifulSoup objects don't support insert_before().")
|
||||
|
||||
def insert_after(self, successor):
|
||||
raise ValueError("BeautifulSoup objects don't support insert_after().")
|
||||
|
||||
def popTag(self):
|
||||
tag = self.tagStack.pop()
|
||||
#print "Pop", tag.name
|
||||
if self.tagStack:
|
||||
self.currentTag = self.tagStack[-1]
|
||||
return self.currentTag
|
||||
|
||||
def pushTag(self, tag):
|
||||
#print "Push", tag.name
|
||||
if self.currentTag:
|
||||
self.currentTag.contents.append(tag)
|
||||
self.tagStack.append(tag)
|
||||
self.currentTag = self.tagStack[-1]
|
||||
|
||||
def endData(self, containerClass=NavigableString):
|
||||
if self.currentData:
|
||||
currentData = u''.join(self.currentData)
|
||||
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
|
||||
not set([tag.name for tag in self.tagStack]).intersection(
|
||||
self.builder.preserve_whitespace_tags)):
|
||||
if '\n' in currentData:
|
||||
currentData = '\n'
|
||||
else:
|
||||
currentData = ' '
|
||||
self.currentData = []
|
||||
if self.parse_only and len(self.tagStack) <= 1 and \
|
||||
(not self.parse_only.text or \
|
||||
not self.parse_only.search(currentData)):
|
||||
return
|
||||
o = containerClass(currentData)
|
||||
self.object_was_parsed(o)
|
||||
|
||||
def object_was_parsed(self, o):
|
||||
"""Add an object to the parse tree."""
|
||||
o.setup(self.currentTag, self.previous_element)
|
||||
if self.previous_element:
|
||||
self.previous_element.next_element = o
|
||||
self.previous_element = o
|
||||
self.currentTag.contents.append(o)
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
instance of the given tag. If inclusivePop is false, pops the tag
|
||||
stack up to but *not* including the most recent instqance of
|
||||
the given tag."""
|
||||
#print "Popping to %s" % name
|
||||
if name == self.ROOT_TAG_NAME:
|
||||
return
|
||||
|
||||
numPops = 0
|
||||
mostRecentTag = None
|
||||
|
||||
for i in range(len(self.tagStack) - 1, 0, -1):
|
||||
if (name == self.tagStack[i].name
|
||||
and nsprefix == self.tagStack[i].nsprefix == nsprefix):
|
||||
numPops = len(self.tagStack) - i
|
||||
break
|
||||
if not inclusivePop:
|
||||
numPops = numPops - 1
|
||||
|
||||
for i in range(0, numPops):
|
||||
mostRecentTag = self.popTag()
|
||||
return mostRecentTag
|
||||
|
||||
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
||||
"""Push a start tag on to the stack.
|
||||
|
||||
If this method returns None, the tag was rejected by the
|
||||
SoupStrainer. You should proceed as if the tag had not occured
|
||||
in the document. For instance, if this was a self-closing tag,
|
||||
don't call handle_endtag.
|
||||
"""
|
||||
|
||||
# print "Start tag %s: %s" % (name, attrs)
|
||||
self.endData()
|
||||
|
||||
if (self.parse_only and len(self.tagStack) <= 1
|
||||
and (self.parse_only.text
|
||||
or not self.parse_only.search_tag(name, attrs))):
|
||||
return None
|
||||
|
||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self.previous_element)
|
||||
if tag is None:
|
||||
return tag
|
||||
if self.previous_element:
|
||||
self.previous_element.next_element = tag
|
||||
self.previous_element = tag
|
||||
self.pushTag(tag)
|
||||
return tag
|
||||
|
||||
def handle_endtag(self, name, nsprefix=None):
|
||||
#print "End tag: " + name
|
||||
self.endData()
|
||||
self._popToTag(name, nsprefix)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.currentData.append(data)
|
||||
|
||||
def decode(self, pretty_print=False,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
"""Returns a string or Unicode representation of this document.
|
||||
To get Unicode, pass None for encoding."""
|
||||
|
||||
if self.is_xml:
|
||||
# Print the XML declaration
|
||||
encoding_part = ''
|
||||
if eventual_encoding != None:
|
||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
||||
else:
|
||||
prefix = u''
|
||||
if not pretty_print:
|
||||
indent_level = None
|
||||
else:
|
||||
indent_level = 0
|
||||
return prefix + super(BeautifulSoup, self).decode(
|
||||
indent_level, eventual_encoding, formatter)
|
||||
|
||||
class BeautifulStoneSoup(BeautifulSoup):
|
||||
"""Deprecated interface to an XML parser."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['features'] = 'xml'
|
||||
warnings.warn(
|
||||
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.')
|
||||
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class StopParsing(Exception):
|
||||
pass
|
||||
|
||||
|
||||
#By default, act as an HTML pretty-printer.
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
soup = BeautifulSoup(sys.stdin)
|
||||
print soup.prettify()
|
||||
307  libs/bs4/builder/__init__.py  Normal file
@@ -0,0 +1,307 @@
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import sys
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
whitespace_re
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'HTMLTreeBuilder',
|
||||
'SAXTreeBuilder',
|
||||
'TreeBuilder',
|
||||
'TreeBuilderRegistry',
|
||||
]
|
||||
|
||||
# Some useful features for a TreeBuilder to have.
|
||||
FAST = 'fast'
|
||||
PERMISSIVE = 'permissive'
|
||||
STRICT = 'strict'
|
||||
XML = 'xml'
|
||||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
|
||||
def __init__(self):
|
||||
self.builders_for_feature = defaultdict(list)
|
||||
self.builders = []
|
||||
|
||||
def register(self, treebuilder_class):
|
||||
"""Register a treebuilder based on its advertised features."""
|
||||
for feature in treebuilder_class.features:
|
||||
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
||||
self.builders.insert(0, treebuilder_class)
|
||||
|
||||
def lookup(self, *features):
|
||||
if len(self.builders) == 0:
|
||||
# There are no builders at all.
|
||||
return None
|
||||
|
||||
if len(features) == 0:
|
||||
# They didn't ask for any features. Give them the most
|
||||
# recently registered builder.
|
||||
return self.builders[0]
|
||||
|
||||
# Go down the list of features in order, and eliminate any builders
|
||||
# that don't match every feature.
|
||||
features = list(features)
|
||||
features.reverse()
|
||||
candidates = None
|
||||
candidate_set = None
|
||||
while len(features) > 0:
|
||||
feature = features.pop()
|
||||
we_have_the_feature = self.builders_for_feature.get(feature, [])
|
||||
if len(we_have_the_feature) > 0:
|
||||
if candidates is None:
|
||||
candidates = we_have_the_feature
|
||||
candidate_set = set(candidates)
|
||||
else:
|
||||
# Eliminate any candidates that don't have this feature.
|
||||
candidate_set = candidate_set.intersection(
|
||||
set(we_have_the_feature))
|
||||
|
||||
# The only valid candidates are the ones in candidate_set.
|
||||
# Go through the original list of candidates and pick the first one
|
||||
# that's in candidate_set.
|
||||
if candidate_set is None:
|
||||
return None
|
||||
for candidate in candidates:
|
||||
if candidate in candidate_set:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
# The BeautifulSoup class will take feature lists from developers and use them
|
||||
# to look up builders in this registry.
|
||||
builder_registry = TreeBuilderRegistry()
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Turn a document into a Beautiful Soup object tree."""
|
||||
|
||||
features = []
|
||||
|
||||
is_xml = False
|
||||
preserve_whitespace_tags = set()
|
||||
empty_element_tags = None # A tag will be considered an empty-element
|
||||
# tag when and only when it has no contents.
|
||||
|
||||
# A value for these tag/attribute combinations is a space- or
|
||||
# comma-separated list of CDATA, rather than a single CDATA.
|
||||
cdata_list_attributes = {}
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.soup = None
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
def can_be_empty_element(self, tag_name):
|
||||
"""Might a tag with this name be an empty-element tag?
|
||||
|
||||
The final markup may or may not actually present this tag as
|
||||
self-closing.
|
||||
|
||||
For instance: an HTMLBuilder does not consider a <p> tag to be
|
||||
an empty-element tag (it's not in
|
||||
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
||||
will be presented as "<p></p>", not "<p />".
|
||||
|
||||
The default implementation has no opinion about which tags are
|
||||
empty-element tags, so a tag will be presented as an
|
||||
empty-element tag if and only if it has no contents.
|
||||
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
|
||||
be left alone.
|
||||
"""
|
||||
if self.empty_element_tags is None:
|
||||
return True
|
||||
return tag_name in self.empty_element_tags
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
return markup, None, None, False
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""Wrap an HTML fragment to make it look like a document.
|
||||
|
||||
Different parsers do this differently. For instance, lxml
|
||||
introduces an empty <head> tag, and html5lib
|
||||
doesn't. Abstracting this away lets us write simple tests
|
||||
which run HTML fragments through the parser and compare the
|
||||
results against other HTML fragments.
|
||||
|
||||
This method should not be used outside of tests.
|
||||
"""
|
||||
return fragment
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
return False
|
||||
|
||||
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
|
||||
"""Replaces class="foo bar" with class=["foo", "bar"]
|
||||
|
||||
Modifies its input in place.
|
||||
"""
|
||||
if self.cdata_list_attributes:
|
||||
universal = self.cdata_list_attributes.get('*', [])
|
||||
tag_specific = self.cdata_list_attributes.get(
|
||||
tag_name.lower(), [])
|
||||
for cdata_list_attr in itertools.chain(universal, tag_specific):
|
||||
if cdata_list_attr in dict(attrs):
|
||||
# Basically, we have a "class" attribute whose
|
||||
# value is a whitespace-separated list of CSS
|
||||
# classes. Split it into a list.
|
||||
value = attrs[cdata_list_attr]
|
||||
values = whitespace_re.split(value)
|
||||
attrs[cdata_list_attr] = values
|
||||
return attrs
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
|
||||
"""A Beautiful Soup treebuilder that listens for SAX events."""
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
attrs = dict((key[1], value) for key, value in list(attrs.items()))
|
||||
#print "Start %s, %r" % (name, attrs)
|
||||
self.soup.handle_starttag(name, attrs)
|
||||
|
||||
def endElement(self, name):
|
||||
#print "End %s" % name
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def startElementNS(self, nsTuple, nodeName, attrs):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.startElement(nodeName, attrs)
|
||||
|
||||
def endElementNS(self, nsTuple, nodeName):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.endElement(nodeName)
|
||||
#handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||
|
||||
def startPrefixMapping(self, prefix, nodeValue):
|
||||
# Ignore the prefix for now.
|
||||
pass
|
||||
|
||||
def endPrefixMapping(self, prefix):
|
||||
# Ignore the prefix for now.
|
||||
# handler.endPrefixMapping(prefix)
|
||||
pass
|
||||
|
||||
def characters(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def startDocument(self):
|
||||
pass
|
||||
|
||||
def endDocument(self):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTreeBuilder(TreeBuilder):
|
||||
"""This TreeBuilder knows facts about HTML.
|
||||
|
||||
Such as which tags are empty-element tags.
|
||||
"""
|
||||
|
||||
preserve_whitespace_tags = set(['pre', 'textarea'])
|
||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
||||
'spacer', 'link', 'frame', 'base'])
|
||||
|
||||
# The HTML standard defines these attributes as containing a
|
||||
# space-separated list of values, not a single value. That is,
|
||||
# class="foo bar" means that the 'class' attribute has two values,
|
||||
# 'foo' and 'bar', not the single value 'foo bar'. When we
|
||||
# encounter one of these attributes, we will parse its value into
|
||||
# a list of values if possible. Upon output, the list will be
|
||||
# converted back into a string.
|
||||
cdata_list_attributes = {
|
||||
"*" : ['class', 'accesskey', 'dropzone'],
|
||||
"a" : ['rel', 'rev'],
|
||||
"link" : ['rel', 'rev'],
|
||||
"td" : ["headers"],
|
||||
"th" : ["headers"],
|
||||
"td" : ["headers"],
|
||||
"form" : ["accept-charset"],
|
||||
"object" : ["archive"],
|
||||
|
||||
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
|
||||
"area" : ["rel"],
|
||||
"icon" : ["sizes"],
|
||||
"iframe" : ["sandbox"],
|
||||
"output" : ["for"],
|
||||
}
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
# We are only interested in <meta> tags
|
||||
if tag.name != 'meta':
|
||||
return False
|
||||
|
||||
http_equiv = tag.get('http-equiv')
|
||||
content = tag.get('content')
|
||||
charset = tag.get('charset')
|
||||
|
||||
# We are interested in <meta> tags that say what encoding the
|
||||
# document was originally in. This means HTML 5-style <meta>
|
||||
# tags that provide the "charset" attribute. It also means
|
||||
# HTML 4-style <meta> tags that provide the "content"
|
||||
# attribute and have "http-equiv" set to "content-type".
|
||||
#
|
||||
# In both cases we will replace the value of the appropriate
|
||||
# attribute with a standin object that can take on any
|
||||
# encoding.
|
||||
meta_encoding = None
|
||||
if charset is not None:
|
||||
# HTML 5 style:
|
||||
# <meta charset="utf8">
|
||||
meta_encoding = charset
|
||||
tag['charset'] = CharsetMetaAttributeValue(charset)
|
||||
|
||||
elif (content is not None and http_equiv is not None
|
||||
and http_equiv.lower() == 'content-type'):
|
||||
# HTML 4 style:
|
||||
# <meta http-equiv="content-type" content="text/html; charset=utf8">
|
||||
tag['content'] = ContentMetaAttributeValue(content)
|
||||
|
||||
return (meta_encoding is not None)
|
||||
|
||||
def register_treebuilders_from(module):
|
||||
"""Copy TreeBuilders from the given module into this module."""
|
||||
# I'm fairly sure this is not the best way to do this.
|
||||
this_module = sys.modules['bs4.builder']
|
||||
for name in module.__all__:
|
||||
obj = getattr(module, name)
|
||||
|
||||
if issubclass(obj, TreeBuilder):
|
||||
setattr(this_module, name, obj)
|
||||
this_module.__all__.append(name)
|
||||
# Register the builder while we're at it.
|
||||
this_module.builder_registry.register(obj)
|
||||
|
||||
# Builders are registered in reverse order of priority, so that custom
|
||||
# builder registrations will take precedence. In general, we want lxml
|
||||
# to take precedence over html5lib, because it's faster. And we only
|
||||
# want to use HTMLParser as a last result.
|
||||
from . import _htmlparser
|
||||
register_treebuilders_from(_htmlparser)
|
||||
try:
|
||||
from . import _html5lib
|
||||
register_treebuilders_from(_html5lib)
|
||||
except ImportError:
|
||||
# They don't have html5lib installed.
|
||||
pass
|
||||
try:
|
||||
from . import _lxml
|
||||
register_treebuilders_from(_lxml)
|
||||
except ImportError:
|
||||
# They don't have lxml installed.
|
||||
pass
|
||||
222  libs/bs4/builder/_html5lib.py  Normal file
@@ -0,0 +1,222 @@
|
||||
__all__ = [
|
||||
'HTML5TreeBuilder',
|
||||
]
|
||||
|
||||
import warnings
|
||||
from bs4.builder import (
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
HTMLTreeBuilder,
|
||||
)
|
||||
from bs4.element import NamespacedAttribute
|
||||
import html5lib
|
||||
from html5lib.constants import namespaces
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
Tag,
|
||||
)
|
||||
|
||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
"""Use html5lib to build a tree."""
|
||||
|
||||
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding):
|
||||
# Store the user-specified encoding for use later on.
|
||||
self.user_specified_encoding = user_specified_encoding
|
||||
return markup, None, None, False
|
||||
|
||||
# These methods are defined by Beautiful Soup.
|
||||
def feed(self, markup):
|
||||
if self.soup.parse_only is not None:
|
||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||
doc = parser.parse(markup, encoding=self.user_specified_encoding)
|
||||
|
||||
# Set the character encoding detected by the tokenizer.
|
||||
if isinstance(markup, unicode):
|
||||
# We need to special-case this because html5lib sets
|
||||
# charEncoding to UTF-8 if it gets Unicode input.
|
||||
doc.original_encoding = None
|
||||
else:
|
||||
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
|
||||
|
||||
def create_treebuilder(self, namespaceHTMLElements):
|
||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||
self.soup, namespaceHTMLElements)
|
||||
return self.underlying_builder
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<html><head></head><body>%s</body></html>' % fragment
|
||||
|
||||
|
||||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||
|
||||
def __init__(self, soup, namespaceHTMLElements):
|
||||
self.soup = soup
|
||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||
|
||||
def documentClass(self):
|
||||
self.soup.reset()
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def elementClass(self, name, namespace):
|
||||
tag = self.soup.new_tag(name, namespace)
|
||||
return Element(tag, self.soup, namespace)
|
||||
|
||||
def commentClass(self, data):
|
||||
return TextNode(Comment(data), self.soup)
|
||||
|
||||
def fragmentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
self.soup.name = "[document_fragment]"
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def appendChild(self, node):
|
||||
# XXX This code is not covered by the BS4 tests.
|
||||
self.soup.append(node.element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.soup
|
||||
|
||||
def getFragment(self):
|
||||
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
|
||||
|
||||
class AttrList(object):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
self.attrs = dict(self.element.attrs)
|
||||
def __iter__(self):
|
||||
return list(self.attrs.items()).__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
"set attr", name, value
|
||||
self.element[name] = value
|
||||
def items(self):
|
||||
return list(self.attrs.items())
|
||||
def keys(self):
|
||||
return list(self.attrs.keys())
|
||||
def __len__(self):
|
||||
return len(self.attrs)
|
||||
def __getitem__(self, name):
|
||||
return self.attrs[name]
|
||||
def __contains__(self, name):
|
||||
return name in list(self.attrs.keys())
|
||||
|
||||
|
||||
class Element(html5lib.treebuilders._base.Node):
|
||||
def __init__(self, element, soup, namespace):
|
||||
html5lib.treebuilders._base.Node.__init__(self, element.name)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
self.namespace = namespace
|
||||
|
||||
def appendChild(self, node):
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
# Concatenate new text onto old text node
|
||||
# XXX This has O(n^2) performance, for input like
|
||||
# "a</a>a</a>a</a>..."
|
||||
old_element = self.element.contents[-1]
|
||||
new_element = self.soup.new_string(old_element + node.element)
|
||||
old_element.replace_with(new_element)
|
||||
else:
|
||||
self.element.append(node.element)
|
||||
node.parent = self
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
if attributes is not None and len(attributes) > 0:
|
||||
|
||||
converted_attributes = []
|
||||
for name, value in list(attributes.items()):
|
||||
if isinstance(name, tuple):
|
||||
new_name = NamespacedAttribute(*name)
|
||||
del attributes[name]
|
||||
attributes[new_name] = value
|
||||
|
||||
self.soup.builder._replace_cdata_list_attribute_values(
|
||||
self.name, attributes)
|
||||
for name, value in attributes.items():
|
||||
self.element[name] = value
|
||||
|
||||
# The attributes may contain variables that need substitution.
|
||||
# Call set_up_substitutions manually.
|
||||
#
|
||||
# The Tag constructor called this method when the Tag was created,
|
||||
# but we just set/changed the attributes, so call it again.
|
||||
self.soup.builder.set_up_substitutions(self.element)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = TextNode(self.soup.new_string(data), self.soup)
|
||||
if insertBefore:
|
||||
self.insertBefore(text, insertBefore)
|
||||
else:
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.element.index(refNode.element)
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[index-1].__class__ == NavigableString):
|
||||
# (See comments in appendChild)
|
||||
old_node = self.element.contents[index-1]
|
||||
new_str = self.soup.new_string(old_node + node.element)
|
||||
old_node.replace_with(new_str)
|
||||
else:
|
||||
self.element.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
node.element.extract()
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
while self.element.contents:
|
||||
child = self.element.contents[0]
|
||||
child.extract()
|
||||
if isinstance(child, Tag):
|
||||
newParent.appendChild(
|
||||
Element(child, self.soup, namespaces["html"]))
|
||||
else:
|
||||
newParent.appendChild(
|
||||
TextNode(child, self.soup))
|
||||
|
||||
def cloneNode(self):
|
||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||
node = Element(tag, self.soup, self.namespace)
|
||||
for key,value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TextNode(Element):
|
||||
def __init__(self, element, soup):
|
||||
html5lib.treebuilders._base.Node.__init__(self, None)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
|
||||
def cloneNode(self):
|
||||
raise NotImplementedError
|
||||
244  libs/bs4/builder/_htmlparser.py  Normal file
@@ -0,0 +1,244 @@
|
||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
|
||||
__all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from HTMLParser import (
|
||||
HTMLParser,
|
||||
HTMLParseError,
|
||||
)
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
||||
# argument, which we'd like to set to False. Unfortunately,
|
||||
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
||||
# before Python 3.2.3.
|
||||
#
|
||||
# At the end of this file, we monkeypatch HTMLParser so that
|
||||
# strict=True works well on Python 3.2.2.
|
||||
major, minor, release = sys.version_info[:3]
|
||||
CONSTRUCTOR_TAKES_STRICT = (
|
||||
major > 3
|
||||
or (major == 3 and minor > 2)
|
||||
or (major == 3 and minor == 2 and release >= 3))
|
||||
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
Declaration,
|
||||
Doctype,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
)
|
||||
|
||||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
def handle_starttag(self, name, attrs):
|
||||
# XXX namespace
|
||||
self.soup.handle_starttag(name, None, None, dict(attrs))
|
||||
|
||||
def handle_endtag(self, name):
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed.
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
try:
|
||||
data = unichr(real_name)
|
||||
except (ValueError, OverflowError), e:
|
||||
data = u"\N{REPLACEMENT CHARACTER}"
|
||||
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||
if character is not None:
|
||||
data = character
|
||||
else:
|
||||
data = "&%s;" % name
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self.soup.endData()
|
||||
if data.startswith("DOCTYPE "):
|
||||
data = data[len("DOCTYPE "):]
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
if data.upper().startswith('CDATA['):
|
||||
cls = CData
|
||||
data = data[len('CDATA['):]
|
||||
else:
|
||||
cls = Declaration
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(cls)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self.soup.endData()
|
||||
if data.endswith("?") and data.lower().startswith("xml"):
|
||||
# "An XHTML processing instruction using the trailing '?'
|
||||
# will cause the '?' to be included in data." - HTMLParser
|
||||
# docs.
|
||||
#
|
||||
# Strip the question mark so we don't end up with two
|
||||
# question marks.
|
||||
data = data[:-1]
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
|
||||
is_xml = False
|
||||
features = [HTML, STRICT, HTMLPARSER]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
if CONSTRUCTOR_TAKES_STRICT:
|
||||
kwargs['strict'] = False
|
||||
self.parser_args = (args, kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:return: A 4-tuple (markup, original encoding, encoding
|
||||
declared within markup, whether any characters had to be
|
||||
replaced with REPLACEMENT CHARACTER).
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
return markup, None, None, False
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
||||
return (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
|
||||
|
||||
def feed(self, markup):
|
||||
args, kwargs = self.parser_args
|
||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
except HTMLParseError, e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||
# string.
|
||||
#
|
||||
# XXX This code can be removed once most Python 3 users are on 3.2.3.
|
||||
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
|
||||
import re
|
||||
attrfind_tolerant = re.compile(
|
||||
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
|
||||
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
|
||||
|
||||
locatestarttagend = re.compile(r"""
|
||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||
(?:\s+ # whitespace before attribute name
|
||||
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
|
||||
(?:\s*=\s* # value indicator
|
||||
(?:'[^']*' # LITA-enclosed value
|
||||
|\"[^\"]*\" # LIT-enclosed value
|
||||
|[^'\">\s]+ # bare value
|
||||
)
|
||||
)?
|
||||
)
|
||||
)*
|
||||
\s* # trailing whitespace
|
||||
""", re.VERBOSE)
|
||||
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
|
||||
|
||||
from html.parser import tagfind, attrfind
|
||||
|
||||
def parse_starttag(self, i):
|
||||
self.__starttag_text = None
|
||||
endpos = self.check_for_whole_start_tag(i)
|
||||
if endpos < 0:
|
||||
return endpos
|
||||
rawdata = self.rawdata
|
||||
self.__starttag_text = rawdata[i:endpos]
|
||||
|
||||
# Now parse the data between i+1 and j into a tag and attrs
|
||||
attrs = []
|
||||
match = tagfind.match(rawdata, i+1)
|
||||
assert match, 'unexpected call to parse_starttag()'
|
||||
k = match.end()
|
||||
self.lasttag = tag = rawdata[i+1:k].lower()
|
||||
while k < endpos:
|
||||
if self.strict:
|
||||
m = attrfind.match(rawdata, k)
|
||||
else:
|
||||
m = attrfind_tolerant.match(rawdata, k)
|
||||
if not m:
|
||||
break
|
||||
attrname, rest, attrvalue = m.group(1, 2, 3)
|
||||
if not rest:
|
||||
attrvalue = None
|
||||
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
|
||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||
attrvalue = attrvalue[1:-1]
|
||||
if attrvalue:
|
||||
attrvalue = self.unescape(attrvalue)
|
||||
attrs.append((attrname.lower(), attrvalue))
|
||||
k = m.end()
|
||||
|
||||
end = rawdata[k:endpos].strip()
|
||||
if end not in (">", "/>"):
|
||||
lineno, offset = self.getpos()
|
||||
if "\n" in self.__starttag_text:
|
||||
lineno = lineno + self.__starttag_text.count("\n")
|
||||
offset = len(self.__starttag_text) \
|
||||
- self.__starttag_text.rfind("\n")
|
||||
else:
|
||||
offset = offset + len(self.__starttag_text)
|
||||
if self.strict:
|
||||
self.error("junk characters in start tag: %r"
|
||||
% (rawdata[k:endpos][:20],))
|
||||
self.handle_data(rawdata[i:endpos])
|
||||
return endpos
|
||||
if end.endswith('/>'):
|
||||
# XHTML-style empty tag: <span attr="value" />
|
||||
self.handle_startendtag(tag, attrs)
|
||||
else:
|
||||
self.handle_starttag(tag, attrs)
|
||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag)
|
||||
return endpos
|
||||
|
||||
def set_cdata_mode(self, elem):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
|
||||
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
|
||||
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
|
||||
|
||||
CONSTRUCTOR_TAKES_STRICT = True
|
||||
179  libs/bs4/builder/_lxml.py  Normal file
@@ -0,0 +1,179 @@
|
||||
__all__ = [
|
||||
'LXMLTreeBuilderForXML',
|
||||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
||||
from bs4.builder import (
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
PERMISSIVE,
|
||||
TreeBuilder,
|
||||
XML)
|
||||
from bs4.dammit import UnicodeDammit
|
||||
|
||||
LXML = 'lxml'
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||
|
||||
is_xml = True
|
||||
|
||||
# Well, it's permissive by XML parser standards.
|
||||
features = [LXML, XML, FAST, PERMISSIVE]
|
||||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
@property
|
||||
def default_parser(self):
|
||||
# This can either return a parser object or a class, which
|
||||
# will be instantiated with default arguments.
|
||||
return etree.XMLParser(target=self, strip_cdata=False, recover=True)
|
||||
|
||||
def __init__(self, parser=None, empty_element_tags=None):
|
||||
if empty_element_tags is not None:
|
||||
self.empty_element_tags = set(empty_element_tags)
|
||||
if parser is None:
|
||||
# Use the default parser.
|
||||
parser = self.default_parser
|
||||
if isinstance(parser, collections.Callable):
|
||||
# Instantiate the parser with default arguments
|
||||
parser = parser(target=self, strip_cdata=False)
|
||||
self.parser = parser
|
||||
self.soup = None
|
||||
self.nsmaps = None
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
# name. Copied from lxml's src/lxml/sax.py.
|
||||
if tag[0] == '{':
|
||||
return tuple(tag[1:].split('}', 1))
|
||||
else:
|
||||
return (None, tag)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:return: A 3-tuple (markup, original encoding, encoding
|
||||
declared within markup).
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
return markup, None, None, False
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
||||
return (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
|
||||
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, basestring):
|
||||
markup = StringIO(markup)
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
self.parser.feed(data)
|
||||
while data != '':
|
||||
# Now call feed() on the rest of the data, chunk by chunk.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
if data != '':
|
||||
self.parser.feed(data)
|
||||
self.parser.close()
|
||||
|
||||
def close(self):
|
||||
self.nsmaps = None
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
attrs = dict(attrs)
|
||||
|
||||
nsprefix = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(nsmap) == 0 and self.nsmaps != None:
|
||||
# There are no new namespaces for this tag, but namespaces
|
||||
# are in play, so we need a separate tag stack to know
|
||||
# when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
if self.nsmaps is None:
|
||||
self.nsmaps = []
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
for prefix, namespace in nsmap.items():
|
||||
attribute = NamespacedAttribute(
|
||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
namespace, name = self._getNsTag(name)
|
||||
if namespace is not None:
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||
|
||||
def end(self, name):
|
||||
self.soup.endData()
|
||||
completed_tag = self.soup.tagStack[-1]
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = None
|
||||
if namespace is not None:
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(name, nsprefix)
|
||||
if self.nsmaps != None:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
if len(self.nsmaps) == 0:
|
||||
# Namespaces are no longer in play, so don't bother keeping
|
||||
# track of the namespace stack.
|
||||
self.nsmaps = None
|
||||
|
||||
def pi(self, target, data):
|
||||
pass
|
||||
|
||||
def data(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def doctype(self, name, pubid, system):
|
||||
self.soup.endData()
|
||||
doctype = Doctype.for_name_and_ids(name, pubid, system)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def comment(self, content):
|
||||
"Handle comments as Comment objects."
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(content)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
|
||||
features = [LXML, HTML, FAST, PERMISSIVE]
|
||||
is_xml = False
|
||||
|
||||
@property
|
||||
def default_parser(self):
|
||||
return etree.HTMLParser
|
||||
|
||||
def feed(self, markup):
|
||||
self.parser.feed(markup)
|
||||
self.parser.close()
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<html><body>%s</body></html>' % fragment
|
||||
792  libs/bs4/dammit.py  Normal file
@@ -0,0 +1,792 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Beautiful Soup bonus library: Unicode, Dammit
|
||||
|
||||
This class forces XML data into a standard format (usually to UTF-8 or
|
||||
Unicode). It is heavily based on code from Mark Pilgrim's Universal
|
||||
Feed Parser. It does not rewrite the XML or HTML to reflect a new
|
||||
encoding; that's the tree builder's job.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
from htmlentitydefs import codepoint2name
|
||||
import re
|
||||
import warnings
|
||||
|
||||
# Autodetects character encodings. Very useful.
|
||||
# Download from http://chardet.feedparser.org/
|
||||
# or 'apt-get install python-chardet'
|
||||
# or 'easy_install chardet'
|
||||
try:
|
||||
import chardet
|
||||
#import chardet.constants
|
||||
#chardet.constants._debug = 1
|
||||
except ImportError:
|
||||
chardet = None
|
||||
|
||||
# Available from http://cjkpython.i18n.org/.
|
||||
try:
|
||||
import iconv_codec
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
xml_encoding_re = re.compile(
|
||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
||||
html_meta_re = re.compile(
|
||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||
|
||||
class EntitySubstitution(object):
|
||||
|
||||
"""Substitute XML or HTML entities for the corresponding characters."""
|
||||
|
||||
def _populate_class_variables():
|
||||
lookup = {}
|
||||
reverse_lookup = {}
|
||||
characters_for_re = []
|
||||
for codepoint, name in list(codepoint2name.items()):
|
||||
character = unichr(codepoint)
|
||||
if codepoint != 34:
|
||||
# There's no point in turning the quotation mark into
|
||||
# ", unless it happens within an attribute value, which
|
||||
# is handled elsewhere.
|
||||
characters_for_re.append(character)
|
||||
lookup[character] = name
|
||||
# But we do want to turn " into the quotation mark.
|
||||
reverse_lookup[name] = character
|
||||
re_definition = "[%s]" % "".join(characters_for_re)
|
||||
return lookup, reverse_lookup, re.compile(re_definition)
|
||||
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
|
||||
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
|
||||
|
||||
CHARACTER_TO_XML_ENTITY = {
|
||||
"'": "apos",
|
||||
'"': "quot",
|
||||
"&": "amp",
|
||||
"<": "lt",
|
||||
">": "gt",
|
||||
}
|
||||
|
||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
||||
")")
|
||||
|
||||
@classmethod
|
||||
def _substitute_html_entity(cls, matchobj):
|
||||
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def _substitute_xml_entity(cls, matchobj):
|
||||
"""Used with a regular expression to substitute the
|
||||
appropriate XML entity for an XML special character."""
|
||||
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def quoted_attribute_value(self, value):
|
||||
"""Make a value into a quoted XML attribute, possibly escaping it.
|
||||
|
||||
Most strings will be quoted using double quotes.
|
||||
|
||||
Bob's Bar -> "Bob's Bar"
|
||||
|
||||
If a string contains double quotes, it will be quoted using
|
||||
single quotes.
|
||||
|
||||
Welcome to "my bar" -> 'Welcome to "my bar"'
|
||||
|
||||
If a string contains both single and double quotes, the
|
||||
double quotes will be escaped, and the string will be quoted
|
||||
using double quotes.
|
||||
|
||||
Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
|
||||
"""
|
||||
quote_with = '"'
|
||||
if '"' in value:
|
||||
if "'" in value:
|
||||
# The string contains both single and double
|
||||
# quotes. Turn the double quotes into
|
||||
# entities. We quote the double quotes rather than
|
||||
# the single quotes because the entity name is
|
||||
# """ whether this is HTML or XML. If we
|
||||
# quoted the single quotes, we'd have to decide
|
||||
# between &amp;apos; and &amp;squot;.
|
||||
replace_with = """
|
||||
value = value.replace('"', replace_with)
|
||||
else:
|
||||
# There are double quotes but no single quotes.
|
||||
# We can use single quotes to quote the attribute.
|
||||
quote_with = "'"
|
||||
return quote_with + value + quote_with
|
||||
|
||||
@classmethod
|
||||
def substitute_xml(cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign will
|
||||
become <, the greater-than sign will become >, and any
|
||||
ampersands that are not part of an entity definition will
|
||||
become &.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets, and ampersands that aren't part of
|
||||
# entities.
|
||||
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
@classmethod
|
||||
def substitute_html(cls, s):
|
||||
"""Replace certain Unicode characters with named HTML entities.
|
||||
|
||||
This differs from data.encode(encoding, 'xmlcharrefreplace')
|
||||
in that the goal is to make the result more readable (to those
|
||||
with ASCII displays) rather than to recover from
|
||||
errors. There's absolutely nothing wrong with a UTF-8 string
|
||||
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
|
||||
character with "é" will make it more readable to some
|
||||
people.
|
||||
"""
|
||||
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
|
||||
cls._substitute_html_entity, s)
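# Illustrative usage (a sketch, not part of the original module): the two
# public helpers can be called directly on the class.
#
#   from bs4.dammit import EntitySubstitution
#   EntitySubstitution.substitute_html(u"caf\xe9")
#   # -> u'caf&eacute;'
#   EntitySubstitution.substitute_xml('AT&T', make_quoted_attribute=True)
#   # -> '"AT&amp;T"'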
|
||||
|
||||
|
||||
class UnicodeDammit:
|
||||
"""A class for detecting the encoding of a *ML document and
|
||||
converting it to a Unicode string. If the source encoding is
|
||||
windows-1252, can replace MS smart quotes with their HTML or XML
|
||||
equivalents."""
|
||||
|
||||
# This dictionary maps commonly seen values for "charset" in HTML
|
||||
# meta tags to the corresponding Python codec names. It only covers
|
||||
# values that aren't in Python's aliases and can't be determined
|
||||
# by the heuristics in find_codec.
|
||||
CHARSET_ALIASES = {"macintosh": "mac-roman",
|
||||
"x-sjis": "shift-jis"}
|
||||
|
||||
ENCODINGS_WITH_SMART_QUOTES = [
|
||||
"windows-1252",
|
||||
"iso-8859-1",
|
||||
"iso-8859-2",
|
||||
]
|
||||
|
||||
def __init__(self, markup, override_encodings=[],
|
||||
smart_quotes_to=None, is_html=False):
|
||||
self.declared_html_encoding = None
|
||||
self.smart_quotes_to = smart_quotes_to
|
||||
self.tried_encodings = []
|
||||
self.contains_replacement_characters = False
|
||||
|
||||
if markup == '' or isinstance(markup, unicode):
|
||||
self.markup = markup
|
||||
self.unicode_markup = unicode(markup)
|
||||
self.original_encoding = None
|
||||
return
|
||||
|
||||
new_markup, document_encoding, sniffed_encoding = \
|
||||
self._detectEncoding(markup, is_html)
|
||||
self.markup = new_markup
|
||||
|
||||
u = None
|
||||
if new_markup != markup:
|
||||
# _detectEncoding modified the markup, then converted it to
|
||||
# Unicode and then to UTF-8. So convert it from UTF-8.
|
||||
u = self._convert_from("utf8")
|
||||
self.original_encoding = sniffed_encoding
|
||||
|
||||
if not u:
|
||||
for proposed_encoding in (
|
||||
override_encodings + [document_encoding, sniffed_encoding]):
|
||||
if proposed_encoding is not None:
|
||||
u = self._convert_from(proposed_encoding)
|
||||
if u:
|
||||
break
|
||||
|
||||
# If no luck and we have auto-detection library, try that:
|
||||
if not u and chardet and not isinstance(self.markup, unicode):
|
||||
u = self._convert_from(chardet.detect(self.markup)['encoding'])
|
||||
|
||||
# As a last resort, try utf-8 and windows-1252:
|
||||
if not u:
|
||||
for proposed_encoding in ("utf-8", "windows-1252"):
|
||||
u = self._convert_from(proposed_encoding)
|
||||
if u:
|
||||
break
|
||||
|
||||
# As an absolute last resort, try the encodings again with
|
||||
# character replacement.
|
||||
if not u:
|
||||
for proposed_encoding in (
|
||||
override_encodings + [
|
||||
document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
|
||||
if proposed_encoding != "ascii":
|
||||
u = self._convert_from(proposed_encoding, "replace")
|
||||
if u is not None:
|
||||
warnings.warn(
|
||||
UnicodeWarning(
|
||||
"Some characters could not be decoded, and were "
|
||||
"replaced with REPLACEMENT CHARACTER."))
|
||||
self.contains_replacement_characters = True
|
||||
break
|
||||
|
||||
# We could at this point force it to ASCII, but that would
|
||||
# destroy so much data that I think giving up is better
|
||||
self.unicode_markup = u
|
||||
if not u:
|
||||
self.original_encoding = None
|
||||
|
||||
def _sub_ms_char(self, match):
|
||||
"""Changes a MS smart quote character to an XML or HTML
|
||||
entity, or an ASCII character."""
|
||||
orig = match.group(1)
|
||||
if self.smart_quotes_to == 'ascii':
|
||||
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
|
||||
else:
|
||||
sub = self.MS_CHARS.get(orig)
|
||||
if type(sub) == tuple:
|
||||
if self.smart_quotes_to == 'xml':
|
||||
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
|
||||
else:
|
||||
sub = '&'.encode() + sub[0].encode() + ';'.encode()
|
||||
else:
|
||||
sub = sub.encode()
|
||||
return sub
|
||||
|
||||
def _convert_from(self, proposed, errors="strict"):
|
||||
proposed = self.find_codec(proposed)
|
||||
if not proposed or (proposed, errors) in self.tried_encodings:
|
||||
return None
|
||||
self.tried_encodings.append((proposed, errors))
|
||||
markup = self.markup
|
||||
|
||||
# Convert smart quotes to HTML if coming from an encoding
|
||||
# that might have them.
|
||||
if (self.smart_quotes_to is not None
|
||||
and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
|
||||
smart_quotes_re = b"([\x80-\x9f])"
|
||||
smart_quotes_compiled = re.compile(smart_quotes_re)
|
||||
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
||||
|
||||
try:
|
||||
#print "Trying to convert document to %s (errors=%s)" % (
|
||||
# proposed, errors)
|
||||
u = self._to_unicode(markup, proposed, errors)
|
||||
self.markup = u
|
||||
self.original_encoding = proposed
|
||||
except Exception as e:
|
||||
#print "That didn't work!"
|
||||
#print e
|
||||
return None
|
||||
#print "Correct encoding: %s" % proposed
|
||||
return self.markup
|
||||
|
||||
def _to_unicode(self, data, encoding, errors="strict"):
|
||||
'''Given a string and its encoding, decodes the string into Unicode.
|
||||
%encoding is a string recognized by encodings.aliases'''
|
||||
|
||||
# strip Byte Order Mark (if present)
|
||||
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
|
||||
and (data[2:4] != '\x00\x00'):
|
||||
encoding = 'utf-16be'
|
||||
data = data[2:]
|
||||
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
|
||||
and (data[2:4] != '\x00\x00'):
|
||||
encoding = 'utf-16le'
|
||||
data = data[2:]
|
||||
elif data[:3] == '\xef\xbb\xbf':
|
||||
encoding = 'utf-8'
|
||||
data = data[3:]
|
||||
elif data[:4] == '\x00\x00\xfe\xff':
|
||||
encoding = 'utf-32be'
|
||||
data = data[4:]
|
||||
elif data[:4] == '\xff\xfe\x00\x00':
|
||||
encoding = 'utf-32le'
|
||||
data = data[4:]
|
||||
newdata = unicode(data, encoding, errors)
|
||||
return newdata
|
||||
|
||||
def _detectEncoding(self, xml_data, is_html=False):
|
||||
"""Given a document, tries to detect its XML encoding."""
|
||||
xml_encoding = sniffed_xml_encoding = None
|
||||
try:
|
||||
if xml_data[:4] == b'\x4c\x6f\xa7\x94':
|
||||
# EBCDIC
|
||||
xml_data = self._ebcdic_to_ascii(xml_data)
|
||||
elif xml_data[:4] == b'\x00\x3c\x00\x3f':
|
||||
# UTF-16BE
|
||||
sniffed_xml_encoding = 'utf-16be'
|
||||
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
|
||||
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
|
||||
and (xml_data[2:4] != b'\x00\x00'):
|
||||
# UTF-16BE with BOM
|
||||
sniffed_xml_encoding = 'utf-16be'
|
||||
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
|
||||
elif xml_data[:4] == b'\x3c\x00\x3f\x00':
|
||||
# UTF-16LE
|
||||
sniffed_xml_encoding = 'utf-16le'
|
||||
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
|
||||
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
|
||||
(xml_data[2:4] != b'\x00\x00'):
|
||||
# UTF-16LE with BOM
|
||||
sniffed_xml_encoding = 'utf-16le'
|
||||
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
|
||||
elif xml_data[:4] == b'\x00\x00\x00\x3c':
|
||||
# UTF-32BE
|
||||
sniffed_xml_encoding = 'utf-32be'
|
||||
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
|
||||
elif xml_data[:4] == b'\x3c\x00\x00\x00':
|
||||
# UTF-32LE
|
||||
sniffed_xml_encoding = 'utf-32le'
|
||||
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
|
||||
elif xml_data[:4] == b'\x00\x00\xfe\xff':
|
||||
# UTF-32BE with BOM
|
||||
sniffed_xml_encoding = 'utf-32be'
|
||||
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
|
||||
elif xml_data[:4] == b'\xff\xfe\x00\x00':
|
||||
# UTF-32LE with BOM
|
||||
sniffed_xml_encoding = 'utf-32le'
|
||||
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
|
||||
elif xml_data[:3] == b'\xef\xbb\xbf':
|
||||
# UTF-8 with BOM
|
||||
sniffed_xml_encoding = 'utf-8'
|
||||
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
|
||||
else:
|
||||
sniffed_xml_encoding = 'ascii'
|
||||
pass
|
||||
except:
|
||||
xml_encoding_match = None
|
||||
xml_encoding_match = xml_encoding_re.match(xml_data)
|
||||
if not xml_encoding_match and is_html:
|
||||
xml_encoding_match = html_meta_re.search(xml_data)
|
||||
if xml_encoding_match is not None:
|
||||
xml_encoding = xml_encoding_match.groups()[0].decode(
|
||||
'ascii').lower()
|
||||
if is_html:
|
||||
self.declared_html_encoding = xml_encoding
|
||||
if sniffed_xml_encoding and \
|
||||
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
|
||||
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
|
||||
'utf-16', 'utf-32', 'utf_16', 'utf_32',
|
||||
'utf16', 'u16')):
|
||||
xml_encoding = sniffed_xml_encoding
|
||||
return xml_data, xml_encoding, sniffed_xml_encoding
|
||||
|
||||
def find_codec(self, charset):
|
||||
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
|
||||
or (charset and self._codec(charset.replace("-", ""))) \
|
||||
or (charset and self._codec(charset.replace("-", "_"))) \
|
||||
or charset
|
||||
|
||||
def _codec(self, charset):
|
||||
if not charset:
|
||||
return charset
|
||||
codec = None
|
||||
try:
|
||||
codecs.lookup(charset)
|
||||
codec = charset
|
||||
except (LookupError, ValueError):
|
||||
pass
|
||||
return codec
|
||||
|
||||
EBCDIC_TO_ASCII_MAP = None
|
||||
|
||||
def _ebcdic_to_ascii(self, s):
|
||||
c = self.__class__
|
||||
if not c.EBCDIC_TO_ASCII_MAP:
|
||||
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
|
||||
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
|
||||
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
|
||||
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
|
||||
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
|
||||
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
|
||||
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
|
||||
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
|
||||
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
|
||||
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
|
||||
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
|
||||
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
|
||||
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
|
||||
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
|
||||
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
|
||||
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
|
||||
250,251,252,253,254,255)
|
||||
import string
|
||||
c.EBCDIC_TO_ASCII_MAP = string.maketrans(
|
||||
''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
|
||||
return s.translate(c.EBCDIC_TO_ASCII_MAP)
|
||||
|
||||
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
|
||||
MS_CHARS = {b'\x80': ('euro', '20AC'),
|
||||
b'\x81': ' ',
|
||||
b'\x82': ('sbquo', '201A'),
|
||||
b'\x83': ('fnof', '192'),
|
||||
b'\x84': ('bdquo', '201E'),
|
||||
b'\x85': ('hellip', '2026'),
|
||||
b'\x86': ('dagger', '2020'),
|
||||
b'\x87': ('Dagger', '2021'),
|
||||
b'\x88': ('circ', '2C6'),
|
||||
b'\x89': ('permil', '2030'),
|
||||
b'\x8A': ('Scaron', '160'),
|
||||
b'\x8B': ('lsaquo', '2039'),
|
||||
b'\x8C': ('OElig', '152'),
|
||||
b'\x8D': '?',
|
||||
b'\x8E': ('#x17D', '17D'),
|
||||
b'\x8F': '?',
|
||||
b'\x90': '?',
|
||||
b'\x91': ('lsquo', '2018'),
|
||||
b'\x92': ('rsquo', '2019'),
|
||||
b'\x93': ('ldquo', '201C'),
|
||||
b'\x94': ('rdquo', '201D'),
|
||||
b'\x95': ('bull', '2022'),
|
||||
b'\x96': ('ndash', '2013'),
|
||||
b'\x97': ('mdash', '2014'),
|
||||
b'\x98': ('tilde', '2DC'),
|
||||
b'\x99': ('trade', '2122'),
|
||||
b'\x9a': ('scaron', '161'),
|
||||
b'\x9b': ('rsaquo', '203A'),
|
||||
b'\x9c': ('oelig', '153'),
|
||||
b'\x9d': '?',
|
||||
b'\x9e': ('#x17E', '17E'),
|
||||
b'\x9f': ('Yuml', '178'),}
|
||||
|
||||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
|
||||
# horrors like stripping diacritical marks to turn á into a, but also
|
||||
# contains non-horrors like turning “ into ".
|
||||
MS_CHARS_TO_ASCII = {
|
||||
b'\x80' : 'EUR',
|
||||
b'\x81' : ' ',
|
||||
b'\x82' : ',',
|
||||
b'\x83' : 'f',
|
||||
b'\x84' : ',,',
|
||||
b'\x85' : '...',
|
||||
b'\x86' : '+',
|
||||
b'\x87' : '++',
|
||||
b'\x88' : '^',
|
||||
b'\x89' : '%',
|
||||
b'\x8a' : 'S',
|
||||
b'\x8b' : '<',
|
||||
b'\x8c' : 'OE',
|
||||
b'\x8d' : '?',
|
||||
b'\x8e' : 'Z',
|
||||
b'\x8f' : '?',
|
||||
b'\x90' : '?',
|
||||
b'\x91' : "'",
|
||||
b'\x92' : "'",
|
||||
b'\x93' : '"',
|
||||
b'\x94' : '"',
|
||||
b'\x95' : '*',
|
||||
b'\x96' : '-',
|
||||
b'\x97' : '--',
|
||||
b'\x98' : '~',
|
||||
b'\x99' : '(TM)',
|
||||
b'\x9a' : 's',
|
||||
b'\x9b' : '>',
|
||||
b'\x9c' : 'oe',
|
||||
b'\x9d' : '?',
|
||||
b'\x9e' : 'z',
|
||||
b'\x9f' : 'Y',
|
||||
b'\xa0' : ' ',
|
||||
b'\xa1' : '!',
|
||||
b'\xa2' : 'c',
|
||||
b'\xa3' : 'GBP',
|
||||
b'\xa4' : '$', #This approximation is especially parochial--this is the
|
||||
#generic currency symbol.
|
||||
b'\xa5' : 'YEN',
|
||||
b'\xa6' : '|',
|
||||
b'\xa7' : 'S',
|
||||
b'\xa8' : '..',
|
||||
b'\xa9' : '',
|
||||
b'\xaa' : '(th)',
|
||||
b'\xab' : '<<',
|
||||
b'\xac' : '!',
|
||||
b'\xad' : ' ',
|
||||
b'\xae' : '(R)',
|
||||
b'\xaf' : '-',
|
||||
b'\xb0' : 'o',
|
||||
b'\xb1' : '+-',
|
||||
b'\xb2' : '2',
|
||||
b'\xb3' : '3',
|
||||
b'\xb4' : ("'", 'acute'),
|
||||
b'\xb5' : 'u',
|
||||
b'\xb6' : 'P',
|
||||
b'\xb7' : '*',
|
||||
b'\xb8' : ',',
|
||||
b'\xb9' : '1',
|
||||
b'\xba' : '(th)',
|
||||
b'\xbb' : '>>',
|
||||
b'\xbc' : '1/4',
|
||||
b'\xbd' : '1/2',
|
||||
b'\xbe' : '3/4',
|
||||
b'\xbf' : '?',
|
||||
b'\xc0' : 'A',
|
||||
b'\xc1' : 'A',
|
||||
b'\xc2' : 'A',
|
||||
b'\xc3' : 'A',
|
||||
b'\xc4' : 'A',
|
||||
b'\xc5' : 'A',
|
||||
b'\xc6' : 'AE',
|
||||
b'\xc7' : 'C',
|
||||
b'\xc8' : 'E',
|
||||
b'\xc9' : 'E',
|
||||
b'\xca' : 'E',
|
||||
b'\xcb' : 'E',
|
||||
b'\xcc' : 'I',
|
||||
b'\xcd' : 'I',
|
||||
b'\xce' : 'I',
|
||||
b'\xcf' : 'I',
|
||||
b'\xd0' : 'D',
|
||||
b'\xd1' : 'N',
|
||||
b'\xd2' : 'O',
|
||||
b'\xd3' : 'O',
|
||||
b'\xd4' : 'O',
|
||||
b'\xd5' : 'O',
|
||||
b'\xd6' : 'O',
|
||||
b'\xd7' : '*',
|
||||
b'\xd8' : 'O',
|
||||
b'\xd9' : 'U',
|
||||
b'\xda' : 'U',
|
||||
b'\xdb' : 'U',
|
||||
b'\xdc' : 'U',
|
||||
b'\xdd' : 'Y',
|
||||
b'\xde' : 'b',
|
||||
b'\xdf' : 'B',
|
||||
b'\xe0' : 'a',
|
||||
b'\xe1' : 'a',
|
||||
b'\xe2' : 'a',
|
||||
b'\xe3' : 'a',
|
||||
b'\xe4' : 'a',
|
||||
b'\xe5' : 'a',
|
||||
b'\xe6' : 'ae',
|
||||
b'\xe7' : 'c',
|
||||
b'\xe8' : 'e',
|
||||
b'\xe9' : 'e',
|
||||
b'\xea' : 'e',
|
||||
b'\xeb' : 'e',
|
||||
b'\xec' : 'i',
|
||||
b'\xed' : 'i',
|
||||
b'\xee' : 'i',
|
||||
b'\xef' : 'i',
|
||||
b'\xf0' : 'o',
|
||||
b'\xf1' : 'n',
|
||||
b'\xf2' : 'o',
|
||||
b'\xf3' : 'o',
|
||||
b'\xf4' : 'o',
|
||||
b'\xf5' : 'o',
|
||||
b'\xf6' : 'o',
|
||||
b'\xf7' : '/',
|
||||
b'\xf8' : 'o',
|
||||
b'\xf9' : 'u',
|
||||
b'\xfa' : 'u',
|
||||
b'\xfb' : 'u',
|
||||
b'\xfc' : 'u',
|
||||
b'\xfd' : 'y',
|
||||
b'\xfe' : 'b',
|
||||
b'\xff' : 'y',
|
||||
}
|
||||
|
||||
# A map used when removing rogue Windows-1252/ISO-8859-1
|
||||
# characters in otherwise UTF-8 documents.
|
||||
#
|
||||
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
|
||||
# Windows-1252.
|
||||
WINDOWS_1252_TO_UTF8 = {
|
||||
0x80 : b'\xe2\x82\xac', # €
|
||||
0x82 : b'\xe2\x80\x9a', # ‚
|
||||
0x83 : b'\xc6\x92', # ƒ
|
||||
0x84 : b'\xe2\x80\x9e', # „
|
||||
0x85 : b'\xe2\x80\xa6', # …
|
||||
0x86 : b'\xe2\x80\xa0', # †
|
||||
0x87 : b'\xe2\x80\xa1', # ‡
|
||||
0x88 : b'\xcb\x86', # ˆ
|
||||
0x89 : b'\xe2\x80\xb0', # ‰
|
||||
0x8a : b'\xc5\xa0', # Š
|
||||
0x8b : b'\xe2\x80\xb9', # ‹
|
||||
0x8c : b'\xc5\x92', # Œ
|
||||
0x8e : b'\xc5\xbd', # Ž
|
||||
0x91 : b'\xe2\x80\x98', # ‘
|
||||
0x92 : b'\xe2\x80\x99', # ’
|
||||
0x93 : b'\xe2\x80\x9c', # “
|
||||
0x94 : b'\xe2\x80\x9d', # ”
|
||||
0x95 : b'\xe2\x80\xa2', # •
|
||||
0x96 : b'\xe2\x80\x93', # –
|
||||
0x97 : b'\xe2\x80\x94', # —
|
||||
0x98 : b'\xcb\x9c', # ˜
|
||||
0x99 : b'\xe2\x84\xa2', # ™
|
||||
0x9a : b'\xc5\xa1', # š
|
||||
0x9b : b'\xe2\x80\xba', # ›
|
||||
0x9c : b'\xc5\x93', # œ
|
||||
0x9e : b'\xc5\xbe', # ž
|
||||
0x9f : b'\xc5\xb8', # Ÿ
|
||||
0xa0 : b'\xc2\xa0', #
|
||||
0xa1 : b'\xc2\xa1', # ¡
|
||||
0xa2 : b'\xc2\xa2', # ¢
|
||||
0xa3 : b'\xc2\xa3', # £
|
||||
0xa4 : b'\xc2\xa4', # ¤
|
||||
0xa5 : b'\xc2\xa5', # ¥
|
||||
0xa6 : b'\xc2\xa6', # ¦
|
||||
0xa7 : b'\xc2\xa7', # §
|
||||
0xa8 : b'\xc2\xa8', # ¨
|
||||
0xa9 : b'\xc2\xa9', # ©
|
||||
0xaa : b'\xc2\xaa', # ª
|
||||
0xab : b'\xc2\xab', # «
|
||||
0xac : b'\xc2\xac', # ¬
|
||||
0xad : b'\xc2\xad', #
|
||||
0xae : b'\xc2\xae', # ®
|
||||
0xaf : b'\xc2\xaf', # ¯
|
||||
0xb0 : b'\xc2\xb0', # °
|
||||
0xb1 : b'\xc2\xb1', # ±
|
||||
0xb2 : b'\xc2\xb2', # ²
|
||||
0xb3 : b'\xc2\xb3', # ³
|
||||
0xb4 : b'\xc2\xb4', # ´
|
||||
0xb5 : b'\xc2\xb5', # µ
|
||||
0xb6 : b'\xc2\xb6', # ¶
|
||||
0xb7 : b'\xc2\xb7', # ·
|
||||
0xb8 : b'\xc2\xb8', # ¸
|
||||
0xb9 : b'\xc2\xb9', # ¹
|
||||
0xba : b'\xc2\xba', # º
|
||||
0xbb : b'\xc2\xbb', # »
|
||||
0xbc : b'\xc2\xbc', # ¼
|
||||
0xbd : b'\xc2\xbd', # ½
|
||||
0xbe : b'\xc2\xbe', # ¾
|
||||
0xbf : b'\xc2\xbf', # ¿
|
||||
0xc0 : b'\xc3\x80', # À
|
||||
0xc1 : b'\xc3\x81', # Á
|
||||
0xc2 : b'\xc3\x82', # Â
|
||||
0xc3 : b'\xc3\x83', # Ã
|
||||
0xc4 : b'\xc3\x84', # Ä
|
||||
0xc5 : b'\xc3\x85', # Å
|
||||
0xc6 : b'\xc3\x86', # Æ
|
||||
0xc7 : b'\xc3\x87', # Ç
|
||||
0xc8 : b'\xc3\x88', # È
|
||||
0xc9 : b'\xc3\x89', # É
|
||||
0xca : b'\xc3\x8a', # Ê
|
||||
0xcb : b'\xc3\x8b', # Ë
|
||||
0xcc : b'\xc3\x8c', # Ì
|
||||
0xcd : b'\xc3\x8d', # Í
|
||||
0xce : b'\xc3\x8e', # Î
|
||||
0xcf : b'\xc3\x8f', # Ï
|
||||
0xd0 : b'\xc3\x90', # Ð
|
||||
0xd1 : b'\xc3\x91', # Ñ
|
||||
0xd2 : b'\xc3\x92', # Ò
|
||||
0xd3 : b'\xc3\x93', # Ó
|
||||
0xd4 : b'\xc3\x94', # Ô
|
||||
0xd5 : b'\xc3\x95', # Õ
|
||||
0xd6 : b'\xc3\x96', # Ö
|
||||
0xd7 : b'\xc3\x97', # ×
|
||||
0xd8 : b'\xc3\x98', # Ø
|
||||
0xd9 : b'\xc3\x99', # Ù
|
||||
0xda : b'\xc3\x9a', # Ú
|
||||
0xdb : b'\xc3\x9b', # Û
|
||||
0xdc : b'\xc3\x9c', # Ü
|
||||
0xdd : b'\xc3\x9d', # Ý
|
||||
0xde : b'\xc3\x9e', # Þ
|
||||
0xdf : b'\xc3\x9f', # ß
|
||||
0xe0 : b'\xc3\xa0', # à
|
||||
0xe1 : b'\xc3\xa1', # á
|
||||
0xe2 : b'\xc3\xa2', # â
|
||||
0xe3 : b'\xc3\xa3', # ã
|
||||
0xe4 : b'\xc3\xa4', # ä
|
||||
0xe5 : b'\xc3\xa5', # å
|
||||
0xe6 : b'\xc3\xa6', # æ
|
||||
0xe7 : b'\xc3\xa7', # ç
|
||||
0xe8 : b'\xc3\xa8', # è
|
||||
0xe9 : b'\xc3\xa9', # é
|
||||
0xea : b'\xc3\xaa', # ê
|
||||
0xeb : b'\xc3\xab', # ë
|
||||
0xec : b'\xc3\xac', # ì
|
||||
0xed : b'\xc3\xad', # í
|
||||
0xee : b'\xc3\xae', # î
|
||||
0xef : b'\xc3\xaf', # ï
|
||||
0xf0 : b'\xc3\xb0', # ð
|
||||
0xf1 : b'\xc3\xb1', # ñ
|
||||
0xf2 : b'\xc3\xb2', # ò
|
||||
0xf3 : b'\xc3\xb3', # ó
|
||||
0xf4 : b'\xc3\xb4', # ô
|
||||
0xf5 : b'\xc3\xb5', # õ
|
||||
0xf6 : b'\xc3\xb6', # ö
|
||||
0xf7 : b'\xc3\xb7', # ÷
|
||||
0xf8 : b'\xc3\xb8', # ø
|
||||
0xf9 : b'\xc3\xb9', # ù
|
||||
0xfa : b'\xc3\xba', # ú
|
||||
0xfb : b'\xc3\xbb', # û
|
||||
0xfc : b'\xc3\xbc', # ü
|
||||
0xfd : b'\xc3\xbd', # ý
|
||||
0xfe : b'\xc3\xbe', # þ
|
||||
}
|
||||
|
||||
MULTIBYTE_MARKERS_AND_SIZES = [
|
||||
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
|
||||
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
|
||||
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
|
||||
]
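# Illustrative check (a sketch, not part of the original module): the first
# byte of a UTF-8 sequence selects its length, e.g. 0xe2 (the lead byte of
# b'\xe2\x82\xac', the euro sign) falls in the E0-EF range and so spans 3 bytes.
#
#   lead = 0xe2
#   [size for start, end, size in MULTIBYTE_MARKERS_AND_SIZES
#    if start <= lead <= end]
#   # -> [3]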
|
||||
|
||||
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
|
||||
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||||
|
||||
@classmethod
|
||||
def detwingle(cls, in_bytes, main_encoding="utf8",
|
||||
embedded_encoding="windows-1252"):
|
||||
"""Fix characters from one encoding embedded in some other encoding.
|
||||
|
||||
Currently the only situation supported is Windows-1252 (or its
|
||||
subset ISO-8859-1), embedded in UTF-8.
|
||||
|
||||
The input must be a bytestring. If you've already converted
|
||||
the document to Unicode, you're too late.
|
||||
|
||||
The output is a bytestring in which `embedded_encoding`
|
||||
characters have been converted to their `main_encoding`
|
||||
equivalents.
|
||||
"""
|
||||
if embedded_encoding.replace('_', '-').lower() not in (
|
||||
'windows-1252', 'windows_1252'):
|
||||
raise NotImplementedError(
|
||||
"Windows-1252 and ISO-8859-1 are the only currently supported "
|
||||
"embedded encodings.")
|
||||
|
||||
if main_encoding.lower() not in ('utf8', 'utf-8'):
|
||||
raise NotImplementedError(
|
||||
"UTF-8 is the only currently supported main encoding.")
|
||||
|
||||
byte_chunks = []
|
||||
|
||||
chunk_start = 0
|
||||
pos = 0
|
||||
while pos < len(in_bytes):
|
||||
byte = in_bytes[pos]
|
||||
if not isinstance(byte, int):
|
||||
# Python 2.x
|
||||
byte = ord(byte)
|
||||
if (byte >= cls.FIRST_MULTIBYTE_MARKER
|
||||
and byte <= cls.LAST_MULTIBYTE_MARKER):
|
||||
# This is the start of a UTF-8 multibyte character. Skip
|
||||
# to the end.
|
||||
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
|
||||
if byte >= start and byte <= end:
|
||||
pos += size
|
||||
break
|
||||
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
|
||||
# We found a Windows-1252 character!
|
||||
# Save the string up to this point as a chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:pos])
|
||||
|
||||
# Now translate the Windows-1252 character into UTF-8
|
||||
# and add it as another, one-byte chunk.
|
||||
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
|
||||
pos += 1
|
||||
chunk_start = pos
|
||||
else:
|
||||
# Go on to the next character.
|
||||
pos += 1
|
||||
if chunk_start == 0:
|
||||
# The string is unchanged.
|
||||
return in_bytes
|
||||
else:
|
||||
# Store the final chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:])
|
||||
return b''.join(byte_chunks)
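# Illustrative usage (a sketch, not part of the original module); the values
# shown follow the documented bs4 behaviour.
#
#   from bs4 import UnicodeDammit
#   dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
#   dammit.unicode_markup        # u'Sacr\xe9 bleu!'
#   dammit.original_encoding     # 'latin-1'
#
#   # detwingle() repairs Windows-1252 bytes embedded in otherwise UTF-8 data:
#   snowman = u"\N{SNOWMAN}".encode("utf8")
#   doc = snowman + b"\x93fancy quotes\x94"
#   UnicodeDammit.detwingle(doc).decode("utf8")
#   # -> u'\u2603\u201cfancy quotes\u201d'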
|
||||
|
||||
libs/bs4/element.py | 1347 | Normal file (diff suppressed because it is too large)
libs/bs4/testing.py | 515 | Normal file
@@ -0,0 +1,515 @@
|
||||
"""Helper classes for tests."""
|
||||
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
Comment,
|
||||
ContentMetaAttributeValue,
|
||||
Doctype,
|
||||
SoupStrainer,
|
||||
)
|
||||
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
|
||||
class SoupTest(unittest.TestCase):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return default_builder()
|
||||
|
||||
def soup(self, markup, **kwargs):
|
||||
"""Build a Beautiful Soup object from markup."""
|
||||
builder = kwargs.pop('builder', self.default_builder)
|
||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||
|
||||
def document_for(self, markup):
|
||||
"""Turn an HTML fragment into a document.
|
||||
|
||||
The details depend on the builder.
|
||||
"""
|
||||
return self.default_builder.test_fragment_to_document(markup)
|
||||
|
||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||
builder = self.default_builder
|
||||
obj = BeautifulSoup(to_parse, builder=builder)
|
||||
if compare_parsed_to is None:
|
||||
compare_parsed_to = to_parse
|
||||
|
||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
|
||||
class HTMLTreeBuilderSmokeTest(object):
|
||||
|
||||
"""A basic test of a treebuilder's competence.
|
||||
|
||||
Any HTML treebuilder, present or future, should be able to pass
|
||||
these tests. With invalid markup, there's room for interpretation,
|
||||
and different parsers can handle it differently. But with the
|
||||
markup in these tests, there's not much room for interpretation.
|
||||
"""
|
||||
|
||||
def assertDoctypeHandled(self, doctype_fragment):
|
||||
"""Assert that a given doctype string is handled correctly."""
|
||||
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||
|
||||
# Make sure a Doctype object was created.
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual(doctype.__class__, Doctype)
|
||||
self.assertEqual(doctype, doctype_fragment)
|
||||
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
|
||||
|
||||
# Make sure that the doctype was correctly associated with the
|
||||
# parse tree and that the rest of the document parsed.
|
||||
self.assertEqual(soup.p.contents[0], 'foo')
|
||||
|
||||
def _document_with_doctype(self, doctype_fragment):
|
||||
"""Generate and parse a document with the given doctype."""
|
||||
doctype = '<!DOCTYPE %s>' % doctype_fragment
|
||||
markup = doctype + '\n<p>foo</p>'
|
||||
soup = self.soup(markup)
|
||||
return doctype, soup
|
||||
|
||||
def test_normal_doctypes(self):
|
||||
"""Make sure normal, everyday HTML doctypes are handled correctly."""
|
||||
self.assertDoctypeHandled("html")
|
||||
self.assertDoctypeHandled(
|
||||
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
||||
|
||||
def test_public_doctype_with_url(self):
|
||||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||
self.assertDoctypeHandled(doctype)
|
||||
|
||||
def test_system_doctype(self):
|
||||
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# We can handle a namespaced doctype with a system ID.
|
||||
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# Test a namespaced doctype with a public id.
|
||||
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out more or less the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8").replace(b"\n", b""),
|
||||
markup.replace(b"\n", b""))
|
||||
|
||||
def test_deepcopy(self):
|
||||
"""Make sure you can copy the tree builder.
|
||||
|
||||
This is important because the builder is part of a
|
||||
BeautifulSoup object, and we want to be able to copy that.
|
||||
"""
|
||||
copy.deepcopy(self.default_builder)
|
||||
|
||||
def test_p_tag_is_never_empty_element(self):
|
||||
"""A <p> tag is never designated as an empty-element tag.
|
||||
|
||||
Even if the markup shows it as an empty-element tag, it
|
||||
shouldn't be presented that way.
|
||||
"""
|
||||
soup = self.soup("<p/>")
|
||||
self.assertFalse(soup.p.is_empty_element)
|
||||
self.assertEqual(str(soup.p), "<p></p>")
|
||||
|
||||
def test_unclosed_tags_get_closed(self):
|
||||
"""A tag that's not closed by the end of the document should be closed.
|
||||
|
||||
This applies to all tags except empty-element tags.
|
||||
"""
|
||||
self.assertSoupEquals("<p>", "<p></p>")
|
||||
self.assertSoupEquals("<b>", "<b></b>")
|
||||
|
||||
self.assertSoupEquals("<br>", "<br/>")
|
||||
|
||||
def test_br_is_always_empty_element_tag(self):
|
||||
"""A <br> tag is designated as an empty-element tag.
|
||||
|
||||
Some parsers treat <br></br> as one <br/> tag, some parsers as
|
||||
two tags, but it should always be an empty-element tag.
|
||||
"""
|
||||
soup = self.soup("<br></br>")
|
||||
self.assertTrue(soup.br.is_empty_element)
|
||||
self.assertEqual(str(soup.br), "<br/>")
|
||||
|
||||
def test_nested_formatting_elements(self):
|
||||
self.assertSoupEquals("<em><em></em></em>")
|
||||
|
||||
def test_comment(self):
|
||||
# Comments are represented as Comment objects.
|
||||
markup = "<p>foo<!--foobar-->baz</p>"
|
||||
self.assertSoupEquals(markup)
|
||||
|
||||
soup = self.soup(markup)
|
||||
comment = soup.find(text="foobar")
|
||||
self.assertEqual(comment.__class__, Comment)
|
||||
|
||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
||||
self.assertSoupEquals("<pre> </pre>")
|
||||
self.assertSoupEquals("<textarea> woo </textarea>")
|
||||
|
||||
def test_nested_inline_elements(self):
|
||||
"""Inline elements can be nested indefinitely."""
|
||||
b_tag = "<b>Inside a B tag</b>"
|
||||
self.assertSoupEquals(b_tag)
|
||||
|
||||
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
|
||||
self.assertSoupEquals(double_nested_b_tag)
|
||||
|
||||
def test_nested_block_level_elements(self):
|
||||
"""Block elements can be nested."""
|
||||
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
|
||||
blockquote = soup.blockquote
|
||||
self.assertEqual(blockquote.p.b.string, 'Foo')
|
||||
self.assertEqual(blockquote.b.string, 'Foo')
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""One table can go inside another one."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tr><td>foo</td></tr></table>'
|
||||
'</td></tr></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||
|
||||
def test_entities_in_attributes_converted_to_unicode(self):
|
||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
|
||||
def test_entities_in_text_converted_to_unicode(self):
|
||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
|
||||
def test_quot_entity_converted_to_quotation_mark(self):
|
||||
self.assertSoupEquals("<p>I said "good day!"</p>",
|
||||
'<p>I said "good day!"</p>')
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
|
||||
def test_basic_namespaces(self):
|
||||
"""Parsers don't need to *understand* namespaces, but at the
|
||||
very least they should not choke on namespaces or lose
|
||||
data."""
|
||||
|
||||
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode())
|
||||
html = soup.html
|
||||
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
|
||||
|
||||
def test_multivalued_attribute_value_becomes_list(self):
|
||||
markup = b'<a class="foo bar">'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(['foo', 'bar'], soup.a['class'])
|
||||
|
||||
#
|
||||
# Generally speaking, tests below this point are more tests of
|
||||
# Beautiful Soup than tests of the tree builders. But parsers are
|
||||
# weird, so we run these tests separately for every tree builder
|
||||
# to detect any differences between them.
|
||||
#
|
||||
|
||||
def test_soupstrainer(self):
|
||||
"""Parsers should be able to work with SoupStrainers."""
|
||||
strainer = SoupStrainer("b")
|
||||
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
|
||||
parse_only=strainer)
|
||||
self.assertEqual(soup.decode(), "<b>bold</b>")
|
||||
|
||||
def test_single_quote_attribute_values_become_double_quotes(self):
|
||||
self.assertSoupEquals("<foo attr='bar'></foo>",
|
||||
'<foo attr="bar"></foo>')
|
||||
|
||||
def test_attribute_values_with_nested_quotes_are_left_alone(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
self.assertSoupEquals(text)
|
||||
|
||||
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
soup = self.soup(text)
|
||||
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
|
||||
self.assertSoupEquals(
|
||||
soup.foo.decode(),
|
||||
"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""")
|
||||
|
||||
def test_ampersand_in_attribute_value_gets_escaped(self):
|
||||
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
|
||||
'<this is="really messed up & stuff"></this>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>',
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>')
|
||||
|
||||
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
|
||||
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
|
||||
|
||||
def test_entities_in_strings_converted_during_parsing(self):
|
||||
# Both XML and HTML entities are converted to Unicode characters
|
||||
# during parsing.
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||
self.assertSoupEquals(text, expected)
|
||||
|
||||
def test_smart_quotes_converted_on_the_way_in(self):
|
||||
# Microsoft smart quotes are converted to Unicode characters during
|
||||
# parsing.
|
||||
quote = b"<p>\x91Foo\x92</p>"
|
||||
soup = self.soup(quote)
|
||||
self.assertEqual(
|
||||
soup.p.string,
|
||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||
|
||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||
soup = self.soup("<a> </a>")
|
||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
||||
|
||||
def test_entities_converted_on_the_way_out(self):
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||
soup = self.soup(text)
|
||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||
|
||||
def test_real_iso_latin_document(self):
|
||||
# Smoke test of interrelated functionality, using an
|
||||
# easy-to-understand document.
|
||||
|
||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||
|
||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||
# that to test.
|
||||
iso_latin_html = unicode_html.encode("iso-8859-1")
|
||||
|
||||
# Parse the ISO-Latin-1 HTML.
|
||||
soup = self.soup(iso_latin_html)
|
||||
# Encode it to UTF-8.
|
||||
result = soup.encode("utf-8")
|
||||
|
||||
# What do we expect the result to look like? Well, it would
|
||||
# look like unicode_html, except that the META tag would say
|
||||
# UTF-8 instead of ISO-Latin-1.
|
||||
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
|
||||
|
||||
# And, of course, it would be in UTF-8, not Unicode.
|
||||
expected = expected.encode("utf-8")
|
||||
|
||||
# Ta-da!
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_real_shift_jis_document(self):
|
||||
# Smoke test to make sure the parser can handle a document in
|
||||
# Shift-JIS encoding, without choking.
|
||||
shift_jis_html = (
|
||||
b'<html><head></head><body><pre>'
|
||||
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
|
||||
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
|
||||
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
|
||||
b'</pre></body></html>')
|
||||
unicode_html = shift_jis_html.decode("shift-jis")
|
||||
soup = self.soup(unicode_html)
|
||||
|
||||
# Make sure the parse tree is correctly encoded to various
|
||||
# encodings.
|
||||
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
|
||||
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
|
||||
|
||||
def test_real_hebrew_document(self):
|
||||
# A real-world test to make sure we can convert ISO-8859-8 (a
|
||||
# Hebrew encoding) to UTF-8.
|
||||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||
soup = self.soup(
|
||||
hebrew_document, from_encoding="iso8859-8")
|
||||
self.assertEqual(soup.original_encoding, 'iso8859-8')
|
||||
self.assertEqual(
|
||||
soup.encode('utf-8'),
|
||||
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||
|
||||
def test_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta content="text/html; charset=x-sjis" '
|
||||
'http-equiv="Content-type"/>')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
|
||||
content = parsed_meta['content']
|
||||
self.assertEqual('text/html; charset=x-sjis', content)
|
||||
|
||||
# But that value is actually a ContentMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
|
||||
|
||||
# For the rest of the story, see TestSubstitutions in
|
||||
# test_tree.py.
|
||||
|
||||
def test_html5_style_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', id="encoding")
|
||||
charset = parsed_meta['charset']
|
||||
self.assertEqual('x-sjis', charset)
|
||||
|
||||
# But that value is actually a CharsetMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('utf8', charset.encode("utf8"))
|
||||
|
||||
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
||||
data = self.soup("<a>text</a>")
|
||||
data.a['foo'] = 'bar'
|
||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||
|
||||
class XMLTreeBuilderSmokeTest(object):
|
||||
|
||||
def test_docstring_generated(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8"), markup)
|
||||
|
||||
|
||||
def test_docstring_includes_correct_encoding(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode("latin1"),
|
||||
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
|
||||
|
||||
def test_large_xml_document(self):
|
||||
"""A large XML document should come out the same as it went in."""
|
||||
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
|
||||
+ b'0' * (2**12)
|
||||
+ b'</root>')
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(soup.encode("utf-8"), markup)
|
||||
|
||||
|
||||
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
|
||||
self.assertSoupEquals("<p>", "<p/>")
|
||||
self.assertSoupEquals("<p>foo</p>")
|
||||
|
||||
def test_namespaces_are_preserved(self):
|
||||
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
|
||||
soup = self.soup(markup)
|
||||
root = soup.root
|
||||
self.assertEqual("http://example.com/", root['xmlns:a'])
|
||||
self.assertEqual("http://example.net/", root['xmlns:b'])
|
||||
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||
"""Smoke test for a tree builder that supports HTML5."""
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
|
||||
# XHTML documents in any particular way.
|
||||
pass
|
||||
|
||||
def test_html_tags_have_namespace(self):
|
||||
markup = "<a>"
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
|
||||
|
||||
def test_svg_tags_have_namespace(self):
|
||||
markup = '<svg><circle/></svg>'
|
||||
soup = self.soup(markup)
|
||||
namespace = "http://www.w3.org/2000/svg"
|
||||
self.assertEqual(namespace, soup.svg.namespace)
|
||||
self.assertEqual(namespace, soup.circle.namespace)
|
||||
|
||||
|
||||
def test_mathml_tags_have_namespace(self):
|
||||
markup = '<math><msqrt>5</msqrt></math>'
|
||||
soup = self.soup(markup)
|
||||
namespace = 'http://www.w3.org/1998/Math/MathML'
|
||||
self.assertEqual(namespace, soup.math.namespace)
|
||||
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||
|
||||
|
||||
def skipIf(condition, reason):
|
||||
def nothing(test, *args, **kwargs):
|
||||
return None
|
||||
|
||||
def decorator(test_item):
|
||||
if condition:
|
||||
return nothing
|
||||
else:
|
||||
return test_item
|
||||
|
||||
return decorator
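# Illustrative usage (an assumption, not from this file): skipIf() mirrors
# unittest.skipIf and is meant to guard parser-specific tests, e.g.:
#
#   try:
#       import lxml
#       LXML_PRESENT = True
#   except ImportError:
#       LXML_PRESENT = False
#
#   @skipIf(not LXML_PRESENT, "lxml is not installed")
#   def test_lxml_specific_behaviour(self):
#       ...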
|
||||
libs/certifi/__init__.py | 1 | Normal file
@@ -0,0 +1 @@
|
||||
from .core import where
|
||||
libs/certifi/cacert.pem | 3338 | Normal file (diff suppressed because it is too large)
libs/certifi/core.py | 19 | Normal file
@@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
certifi.py
|
||||
~~~~~~~~~~
|
||||
|
||||
This module returns the installation location of cacert.pem.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
def where():
|
||||
f = os.path.split(__file__)[0]
|
||||
|
||||
return os.path.join(f, 'cacert.pem')
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(where())
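# Illustrative usage (a sketch, not part of this file): the returned path is
# typically handed to an HTTP client as its CA bundle, e.g. with requests:
#
#   import certifi
#   import requests
#   requests.get("https://example.com", verify=certifi.where())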
|
||||
libs/guessit/ISO-3166-1_utf8.txt | 0 | Normal file → Executable file
libs/guessit/ISO-639-2_utf-8.txt | 0 | Normal file → Executable file
libs/guessit/__init__.py | 2 | Normal file → Executable file
@@ -18,7 +18,7 @@
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
__version__ = '0.4'
|
||||
__version__ = '0.5-dev'
|
||||
__all__ = ['Guess', 'Language',
|
||||
'guess_file_info', 'guess_video_info',
|
||||
'guess_movie_info', 'guess_episode_info']
|
||||
|
||||
libs/guessit/country.py | 4 | Normal file → Executable file
@@ -20,6 +20,7 @@
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from guessit import fileutils
|
||||
from guessit.textutils import to_unicode
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -66,7 +67,8 @@ class Country(object):
|
||||
"""
|
||||
|
||||
def __init__(self, country, strict=False):
|
||||
self.alpha3 = country_to_alpha3.get(country.lower())
|
||||
country = to_unicode(country.strip().lower())
|
||||
self.alpha3 = country_to_alpha3.get(country)
|
||||
|
||||
if self.alpha3 is None and strict:
|
||||
msg = 'The given string "%s" could not be identified as a country'
|
||||
|
||||
libs/guessit/date.py | 0 | Normal file → Executable file
libs/guessit/fileutils.py | 7 | Normal file → Executable file
@@ -29,6 +29,7 @@ def split_path(path):
|
||||
If the given path was an absolute path, the first element will always be:
|
||||
- the '/' root folder on Unix systems
|
||||
- the drive letter on Windows systems (eg: r'C:\')
|
||||
- the mount point '\\' on Windows systems (eg: r'\\host\share')
|
||||
|
||||
>>> split_path('/usr/bin/smewt')
|
||||
['/', 'usr', 'bin', 'smewt']
|
||||
@@ -36,12 +37,6 @@ def split_path(path):
|
||||
>>> split_path('relative_path/to/my_folder/')
|
||||
['relative_path', 'to', 'my_folder']
|
||||
|
||||
>>> split_path(r'C:\Program Files\Smewt\smewt.exe')
|
||||
['C:\\', 'Program Files', 'Smewt', 'smewt.exe']
|
||||
|
||||
>>> split_path(r'Documents and Settings\User\config\\')
|
||||
['Documents and Settings', 'User', 'config']
|
||||
|
||||
"""
|
||||
result = []
|
||||
while True:
|
||||
|
||||
libs/guessit/guess.py | 0 | Normal file → Executable file
libs/guessit/hash_ed2k.py | 0 | Normal file → Executable file
libs/guessit/hash_mpc.py | 0 | Normal file → Executable file
libs/guessit/language.py | 81 | Normal file → Executable file
@@ -21,11 +21,13 @@
|
||||
from __future__ import unicode_literals
|
||||
from guessit import fileutils
|
||||
from guessit.country import Country
|
||||
from guessit.textutils import to_unicode
|
||||
import re
|
||||
import logging
|
||||
|
||||
__all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language',
|
||||
'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'search_language' ]
|
||||
'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
|
||||
'search_language' ]
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -46,14 +48,21 @@ _iso639_contents = _iso639_contents[1:]
|
||||
language_matrix = [ l.strip().split('|')
|
||||
for l in _iso639_contents.strip().split('\n') ]
|
||||
|
||||
language_matrix += [ [ 'unk', '', 'un', 'Unknown', 'inconnu' ] ]
|
||||
|
||||
# update information in the language matrix
|
||||
language_matrix += [['mol', '', 'mo', 'Moldavian', 'moldave'],
|
||||
['ass', '', '', 'Assyrian', 'assyrien']]
|
||||
|
||||
# remove unused languages that shadow other common ones with a non-official form
|
||||
for lang in language_matrix:
|
||||
# remove unused languages that shadow other common ones with a non-official form
|
||||
if (lang[2] == 'se' or # Northern Sami shadows Swedish
|
||||
lang[2] == 'br'): # Breton shadows Brazilian
|
||||
language_matrix.remove(lang)
|
||||
lang[2] = ''
|
||||
# add missing information
|
||||
if lang[0] == 'und':
|
||||
lang[2] = 'un'
|
||||
if lang[0] == 'srp':
|
||||
lang[1] = 'scc' # from OpenSubtitles
|
||||
|
||||
|
||||
lng3 = frozenset(l[0] for l in language_matrix if l[0])
|
||||
@@ -87,12 +96,17 @@ lng_fr_name_to_lng3 = dict((fr_name.lower(), l[0])
|
||||
|
||||
# contains a list of exceptions: strings that should be parsed as a language
|
||||
# but which are not in an ISO form
|
||||
lng_exceptions = { 'gr': ('gre', None),
|
||||
lng_exceptions = { 'unknown': ('und', None),
|
||||
'inconnu': ('und', None),
|
||||
'unk': ('und', None),
|
||||
'un': ('und', None),
|
||||
'gr': ('gre', None),
|
||||
'greek': ('gre', None),
|
||||
'esp': ('spa', None),
|
||||
'español': ('spa', None),
|
||||
'se': ('swe', None),
|
||||
'po': ('pt', 'br'),
|
||||
'pb': ('pt', 'br'),
|
||||
'pob': ('pt', 'br'),
|
||||
'br': ('pt', 'br'),
|
||||
'brazilian': ('pt', 'br'),
|
||||
@@ -101,7 +115,8 @@ lng_exceptions = { 'gr': ('gre', None),
|
||||
'ua': ('ukr', None),
|
||||
'cn': ('chi', None),
|
||||
'chs': ('chi', None),
|
||||
'jp': ('jpn', None)
|
||||
'jp': ('jpn', None),
|
||||
'scr': ('hrv', None)
|
||||
}
|
||||
|
||||
|
||||
@@ -130,6 +145,11 @@ class Language(object):
|
||||
You can also distinguish languages for specific countries, such as
|
||||
Portuguese and Brazilian Portuguese.
|
||||
|
||||
There are various properties on the language object that give you the
|
||||
representation of the language for a specific usage, such as .alpha3
|
||||
to get the ISO 3-letter code, or .opensubtitles to get the OpenSubtitles
|
||||
language code.
|
||||
|
||||
>>> Language('fr')
|
||||
Language(French)
|
||||
|
||||
@@ -146,16 +166,19 @@ class Language(object):
|
||||
True
|
||||
|
||||
>>> Language('zz', strict=False).english_name
|
||||
u'Unknown'
|
||||
u'Undetermined'
|
||||
|
||||
>>> Language('pt(br)').opensubtitles
|
||||
u'pob'
|
||||
"""
|
||||
|
||||
_with_country_regexp = re.compile('(.*)\((.*)\)')
|
||||
_with_country_regexp2 = re.compile('(.*)-(.*)')
|
||||
|
||||
def __init__(self, language, country=None, strict=False):
|
||||
language = language.strip().lower()
|
||||
if isinstance(language, str):
|
||||
language = language.decode('utf-8')
|
||||
with_country = Language._with_country_regexp.match(language)
|
||||
def __init__(self, language, country=None, strict=False, scheme=None):
|
||||
language = to_unicode(language.strip().lower())
|
||||
with_country = (Language._with_country_regexp.match(language) or
|
||||
Language._with_country_regexp2.match(language))
|
||||
if with_country:
|
||||
self.lang = Language(with_country.group(1)).lang
|
||||
self.country = Country(with_country.group(2))
|
||||
@@ -164,6 +187,18 @@ class Language(object):
|
||||
self.lang = None
|
||||
self.country = Country(country) if country else None
|
||||
|
||||
# first look for scheme specific languages
|
||||
if scheme == 'opensubtitles':
|
||||
if language == 'br':
|
||||
self.lang = 'bre'
|
||||
return
|
||||
elif language == 'se':
|
||||
self.lang = 'sme'
|
||||
return
|
||||
elif scheme is not None:
|
||||
log.warning('Unrecognized scheme: "%s" - Proceeding with standard one' % scheme)
|
||||
|
||||
# look for ISO language codes
|
||||
if len(language) == 2:
|
||||
self.lang = lng2_to_lng3.get(language)
|
||||
elif len(language) == 3:
|
||||
@@ -174,6 +209,7 @@ class Language(object):
|
||||
self.lang = (lng_en_name_to_lng3.get(language) or
|
||||
lng_fr_name_to_lng3.get(language))
|
||||
|
||||
# general language exceptions
|
||||
if self.lang is None and language in lng_exceptions:
|
||||
lang, country = lng_exceptions[language]
|
||||
self.lang = Language(lang).alpha3
|
||||
@@ -186,7 +222,7 @@ class Language(object):
|
||||
|
||||
if self.lang is None:
|
||||
log.debug(msg)
|
||||
self.lang = 'unk'
|
||||
self.lang = 'und'
|
||||
|
||||
@property
|
||||
def alpha2(self):
|
||||
@@ -208,6 +244,20 @@ class Language(object):
|
||||
def french_name(self):
|
||||
return lng3_to_lng_fr_name[self.lang]
|
||||
|
||||
@property
|
||||
def opensubtitles(self):
|
||||
if self.lang == 'por' and self.country and self.country.alpha2 == 'br':
|
||||
return 'pob'
|
||||
elif self.lang in ['gre', 'srp']:
|
||||
return self.alpha3term
|
||||
return self.alpha3
|
||||
|
||||
@property
|
||||
def tmdb(self):
|
||||
if self.country:
|
||||
return '%s-%s' % (self.alpha2, self.country.alpha2.upper())
|
||||
return self.alpha2
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self.lang)
|
||||
|
||||
@@ -227,7 +277,7 @@ class Language(object):
|
||||
return not self == other
|
||||
|
||||
def __nonzero__(self):
|
||||
return self.lang != 'unk'
|
||||
return self.lang != 'und'
|
||||
|
||||
def __unicode__(self):
|
||||
if self.country:
|
||||
@@ -245,7 +295,8 @@ class Language(object):
|
||||
return 'Language(%s)' % self.english_name
|
||||
|
||||
|
||||
ALL_LANGUAGES = frozenset(Language(lng) for lng in lng_all_names) - frozenset([Language('unk')])
|
||||
UNDETERMINED = Language('und')
|
||||
ALL_LANGUAGES = frozenset(Language(lng) for lng in lng_all_names) - frozenset([UNDETERMINED])
|
||||
ALL_LANGUAGES_NAMES = lng_all_names
|
||||
|
||||
def search_language(string, lang_filter=None):
|
||||
|
||||
libs/guessit/matcher.py | 0 | Normal file → Executable file
libs/guessit/matchtree.py | 0 | Normal file → Executable file
@@ -40,10 +40,10 @@ episode_rexps = [ # ... Season 2 ...
|
||||
(r'saison (?P<season>[0-9]+)', 1.0, (0, 0)),
|
||||
|
||||
# ... s02e13 ...
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).{,3}[EeXx](?P<episodeNumber>[0-9]{1,2})[^0-9]', 1.0, (0, -1)),
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).{,3}(?P<episodeNumber>(?:[EeXx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
|
||||
|
||||
# ... 2x13 ...
|
||||
(r'[^0-9](?P<season>[0-9]{1,2})x(?P<episodeNumber>[0-9]{2})[^0-9]', 0.8, (1, -1)),
|
||||
(r'[^0-9](?P<season>[0-9]{1,2})(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
|
||||
|
||||
# ... s02 ...
|
||||
#(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)),
|
||||
@@ -61,7 +61,7 @@ weak_episode_rexps = [ # ... 213 or 0106 ...
|
||||
(sep + r'(?P<episodeNumber>[0-9]{1,4})' + sep, (1, -1)),
|
||||
|
||||
# ... 2x13 ...
|
||||
(sep + r'[^0-9](?P<season>[0-9]{1,2})\.(?P<episodeNumber>[0-9]{2})[^0-9]' + sep, (1, -1)),
|
||||
(sep + r'[^0-9](?P<season>[0-9]{1,2})\.(?P<episodeNumber>[0-9]{1,2})[^0-9]' + sep, (1, -1)),
|
||||
|
||||
# ... e13 ... for a mini-series without a season number
|
||||
(r'e(?P<episodeNumber>[0-9]{1,4})[^0-9]', (0, -1)),
|
||||
|
||||
libs/guessit/slogging.py | 0 | Normal file → Executable file
libs/guessit/textutils.py | 22 | Normal file → Executable file
@@ -19,6 +19,7 @@
#

from guessit.patterns import sep
import unicodedata
import copy

# string-related functions
@@ -70,6 +71,7 @@ def to_utf8(o):
        return [ to_utf8(i) for i in o ]
    elif isinstance(o, dict):
        # need to do it like that to handle Guess instances correctly
        # FIXME: why is that necessary?
        result = copy.deepcopy(o)
        for key, value in o.items():
            result[to_utf8(key)] = to_utf8(value)
@@ -78,6 +80,26 @@ def to_utf8(o):
    else:
        return o

def to_unicode(o):
    """Convert all strings found in the given object to normalized
    unicode strings, using the UTF-8 codec if needed."""

    if isinstance(o, unicode):
        return unicodedata.normalize('NFC', o)
    if isinstance(o, str):
        return unicodedata.normalize('NFC', o.decode('utf-8'))
    elif isinstance(o, list):
        return [ to_unicode(i) for i in o ]
    elif isinstance(o, dict):
        # need to do it like that to handle Guess instances correctly
        #result = copy.deepcopy(o)
        for key, value in o.items():
            result[to_unicode(key)] = to_unicode(value)
        return result

    else:
        return o


def levenshtein(a, b):
    if not a:
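A small usage sketch for the to_unicode() helper added above (illustrative only; Python 2 semantics, and the import path assumes the vendored guessit package):

    # Sketch only -- assumes libs/guessit/textutils.py as added above (Python 2).
    from guessit.textutils import to_unicode

    print to_unicode('caf\xc3\xa9')             # UTF-8 bytes -> u'caf\xe9', NFC-normalised
    print to_unicode([u'de\u0301', 'abc'])      # lists are converted element by element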
0  libs/guessit/transfo/__init__.py  Normal file → Executable file
0  libs/guessit/transfo/guess_bonus_features.py  Normal file → Executable file
0  libs/guessit/transfo/guess_date.py  Normal file → Executable file
0  libs/guessit/transfo/guess_episode_info_from_position.py  Normal file → Executable file
23  libs/guessit/transfo/guess_episodes_rexps.py  Normal file → Executable file
@@ -26,14 +26,31 @@ import logging

log = logging.getLogger(__name__)

def number_list(s):
    return re.sub('[^0-9]+', ' ', s).split()

def guess_episodes_rexps(string):
    for rexp, confidence, span_adjust in episode_rexps:
        match = re.search(rexp, string, re.IGNORECASE)
        if match:
            return (Guess(match.groupdict(), confidence=confidence),
                    (match.start() + span_adjust[0],
                     match.end() + span_adjust[1]))
            result = (Guess(match.groupdict(), confidence=confidence),
                      (match.start() + span_adjust[0],
                       match.end() + span_adjust[1]))
            # episodes which have a season > 25 are most likely errors
            # (Simpsons is at 23!)
            if int(result[0].get('season', 0)) > 25:
                continue

            # decide whether we have only a single episode number or an
            # episode list
            if result[0].get('episodeNumber'):
                eplist = number_list(result[0]['episodeNumber'])
                result[0].set('episodeNumber', int(eplist[0]), confidence=confidence)

                if len(eplist) > 1:
                    result[0].set('episodeList', map(int, eplist), confidence=confidence)

            return result

    return None, None
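The net effect of the episode-list handling added above, sketched as a direct call (illustrative only; assumes the patched guess_episodes_rexps() together with the new multi-episode pattern in patterns.py):

    # Sketch only -- assumes the patched guessit modules in this commit (Python 2).
    from guessit.transfo.guess_episodes_rexps import guess_episodes_rexps

    guess, span = guess_episodes_rexps('Show.S02E01E02.720p.mkv')
    # The (?:[EeXx][0-9]{1,2})+ pattern captures the whole 'E01E02' run, so the
    # guess carries episodeNumber=1 plus episodeList=[1, 2]; a season above 25 is
    # skipped as a likely false positive.
    print guess['season'], guess['episodeNumber'], guess.get('episodeList')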
3  libs/guessit/transfo/guess_filetype.py  Normal file → Executable file
@@ -87,6 +87,9 @@ def guess_filetype(filename, filetype):
            upgrade_episode()
            break

    if 'tvu.org.ru' in filename:
        upgrade_episode()

    # if no episode info found, assume it's a movie
    upgrade_movie()
0  libs/guessit/transfo/guess_language.py  Normal file → Executable file
0  libs/guessit/transfo/guess_movie_title_from_position.py  Normal file → Executable file
0  libs/guessit/transfo/guess_properties.py  Normal file → Executable file
0  libs/guessit/transfo/guess_release_group.py  Normal file → Executable file
0  libs/guessit/transfo/guess_video_rexps.py  Normal file → Executable file
0  libs/guessit/transfo/guess_weak_episodes_rexps.py  Normal file → Executable file
0  libs/guessit/transfo/guess_website.py  Normal file → Executable file
0  libs/guessit/transfo/guess_year.py  Normal file → Executable file
0  libs/guessit/transfo/post_process.py  Normal file → Executable file
0  libs/guessit/transfo/split_explicit_groups.py  Normal file → Executable file
0  libs/guessit/transfo/split_on_dash.py  Normal file → Executable file
0  libs/guessit/transfo/split_path_components.py  Normal file → Executable file
17  libs/html5lib/__init__.py  Normal file
@@ -0,0 +1,17 @@
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.

Example usage:

import html5lib
f = open("my_document.html")
tree = html5lib.parse(f)
"""
__version__ = "0.95-dev"
from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker
from serializer import serialize
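Beyond the docstring example, the re-exports above are the usual entry points; a hedged sketch of how they fit together (the "dom" tree name is an assumption about this 0.95-dev vintage):

    # Sketch only -- assumes the vendored html5lib 0.95-dev API re-exported above.
    import html5lib

    fragment = html5lib.parseFragment("<b>bold</b> text")   # parse without an implied <html>/<body>
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    document = parser.parse("<p>Hello, world!</p>")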
3085  libs/html5lib/constants.py  Normal file
File diff suppressed because it is too large
0  libs/html5lib/filters/__init__.py  Normal file
10  libs/html5lib/filters/_base.py  Normal file
@@ -0,0 +1,10 @@

class Filter(object):
    def __init__(self, source):
        self.source = source

    def __iter__(self):
        return iter(self.source)

    def __getattr__(self, name):
        return getattr(self.source, name)
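This base class is what every bundled filter subclasses: wrap a token stream, override __iter__, and pass tokens through. A minimal, hypothetical example in the same style:

    # Hypothetical subclass, following the same pattern as the bundled filters.
    import _base

    class UppercaseFilter(_base.Filter):
        def __iter__(self):
            for token in _base.Filter.__iter__(self):
                if token["type"] == "Characters":
                    token["data"] = token["data"].upper()   # transform text tokens only
                yield token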
127  libs/html5lib/filters/formfiller.py  Normal file
@@ -0,0 +1,127 @@
|
||||
#
|
||||
# The goal is to finally have a form filler where you pass data for
|
||||
# each form, using the algorithm for "Seeding a form with initial values"
|
||||
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
|
||||
#
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class SimpleFilter(_base.Filter):
|
||||
def __init__(self, source, fieldStorage):
|
||||
_base.Filter.__init__(self, source)
|
||||
self.fieldStorage = fieldStorage
|
||||
|
||||
def __iter__(self):
|
||||
field_indices = {}
|
||||
state = None
|
||||
field_name = None
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"].lower()
|
||||
if name == "input":
|
||||
field_name = None
|
||||
field_type = None
|
||||
input_value_index = -1
|
||||
input_checked_index = -1
|
||||
for i,(n,v) in enumerate(token["data"]):
|
||||
n = n.lower()
|
||||
if n == u"name":
|
||||
field_name = v.strip(spaceCharacters)
|
||||
elif n == u"type":
|
||||
field_type = v.strip(spaceCharacters)
|
||||
elif n == u"checked":
|
||||
input_checked_index = i
|
||||
elif n == u"value":
|
||||
input_value_index = i
|
||||
|
||||
value_list = self.fieldStorage.getlist(field_name)
|
||||
field_index = field_indices.setdefault(field_name, 0)
|
||||
if field_index < len(value_list):
|
||||
value = value_list[field_index]
|
||||
else:
|
||||
value = ""
|
||||
|
||||
if field_type in (u"checkbox", u"radio"):
|
||||
if value_list:
|
||||
if token["data"][input_value_index][1] == value:
|
||||
if input_checked_index < 0:
|
||||
token["data"].append((u"checked", u""))
|
||||
field_indices[field_name] = field_index + 1
|
||||
elif input_checked_index >= 0:
|
||||
del token["data"][input_checked_index]
|
||||
|
||||
elif field_type not in (u"button", u"submit", u"reset"):
|
||||
if input_value_index >= 0:
|
||||
token["data"][input_value_index] = (u"value", value)
|
||||
else:
|
||||
token["data"].append((u"value", value))
|
||||
field_indices[field_name] = field_index + 1
|
||||
|
||||
field_type = None
|
||||
field_name = None
|
||||
|
||||
elif name == "textarea":
|
||||
field_type = "textarea"
|
||||
field_name = dict((token["data"])[::-1])["name"]
|
||||
|
||||
elif name == "select":
|
||||
field_type = "select"
|
||||
attributes = dict(token["data"][::-1])
|
||||
field_name = attributes.get("name")
|
||||
is_select_multiple = "multiple" in attributes
|
||||
is_selected_option_found = False
|
||||
|
||||
elif field_type == "select" and field_name and name == "option":
|
||||
option_selected_index = -1
|
||||
option_value = None
|
||||
for i,(n,v) in enumerate(token["data"]):
|
||||
n = n.lower()
|
||||
if n == "selected":
|
||||
option_selected_index = i
|
||||
elif n == "value":
|
||||
option_value = v.strip(spaceCharacters)
|
||||
if option_value is None:
|
||||
raise NotImplementedError("<option>s without a value= attribute")
|
||||
else:
|
||||
value_list = self.fieldStorage.getlist(field_name)
|
||||
if value_list:
|
||||
field_index = field_indices.setdefault(field_name, 0)
|
||||
if field_index < len(value_list):
|
||||
value = value_list[field_index]
|
||||
else:
|
||||
value = ""
|
||||
if (is_select_multiple or not is_selected_option_found) and option_value == value:
|
||||
if option_selected_index < 0:
|
||||
token["data"].append((u"selected", u""))
|
||||
field_indices[field_name] = field_index + 1
|
||||
is_selected_option_found = True
|
||||
elif option_selected_index >= 0:
|
||||
del token["data"][option_selected_index]
|
||||
|
||||
elif field_type is not None and field_name and type == "EndTag":
|
||||
name = token["name"].lower()
|
||||
if name == field_type:
|
||||
if name == "textarea":
|
||||
value_list = self.fieldStorage.getlist(field_name)
|
||||
if value_list:
|
||||
field_index = field_indices.setdefault(field_name, 0)
|
||||
if field_index < len(value_list):
|
||||
value = value_list[field_index]
|
||||
else:
|
||||
value = ""
|
||||
yield {"type": "Characters", "data": value}
|
||||
field_indices[field_name] = field_index + 1
|
||||
|
||||
field_name = None
|
||||
|
||||
elif name == "option" and field_type == "select":
|
||||
pass # TODO: part of "option without value= attribute" processing
|
||||
|
||||
elif field_type == "textarea":
|
||||
continue # ignore token
|
||||
|
||||
yield token
|
||||
62  libs/html5lib/filters/inject_meta_charset.py  Normal file
@@ -0,0 +1,62 @@
|
||||
import _base
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def __init__(self, source, encoding):
|
||||
_base.Filter.__init__(self, source)
|
||||
self.encoding = encoding
|
||||
|
||||
def __iter__(self):
|
||||
state = "pre_head"
|
||||
meta_found = (self.encoding is None)
|
||||
pending = []
|
||||
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if token["name"].lower() == u"head":
|
||||
state = "in_head"
|
||||
|
||||
elif type == "EmptyTag":
|
||||
if token["name"].lower() == u"meta":
|
||||
# replace charset with actual encoding
|
||||
has_http_equiv_content_type = False
|
||||
for (namespace,name),value in token["data"].iteritems():
|
||||
if namespace != None:
|
||||
continue
|
||||
elif name.lower() == u'charset':
|
||||
token["data"][(namespace,name)] = self.encoding
|
||||
meta_found = True
|
||||
break
|
||||
elif name == u'http-equiv' and value.lower() == u'content-type':
|
||||
has_http_equiv_content_type = True
|
||||
else:
|
||||
if has_http_equiv_content_type and (None, u"content") in token["data"]:
|
||||
token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
|
||||
meta_found = True
|
||||
|
||||
elif token["name"].lower() == u"head" and not meta_found:
|
||||
# insert meta into empty head
|
||||
yield {"type": "StartTag", "name": u"head",
|
||||
"data": token["data"]}
|
||||
yield {"type": "EmptyTag", "name": u"meta",
|
||||
"data": {(None, u"charset"): self.encoding}}
|
||||
yield {"type": "EndTag", "name": u"head"}
|
||||
meta_found = True
|
||||
continue
|
||||
|
||||
elif type == "EndTag":
|
||||
if token["name"].lower() == u"head" and pending:
|
||||
# insert meta into head (if necessary) and flush pending queue
|
||||
yield pending.pop(0)
|
||||
if not meta_found:
|
||||
yield {"type": "EmptyTag", "name": u"meta",
|
||||
"data": {(None, u"charset"): self.encoding}}
|
||||
while pending:
|
||||
yield pending.pop(0)
|
||||
meta_found = True
|
||||
state = "post_head"
|
||||
|
||||
if state == "in_head":
|
||||
pending.append(token)
|
||||
else:
|
||||
yield token
|
||||
88  libs/html5lib/filters/lint.py  Normal file
@@ -0,0 +1,88 @@
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
import _base
|
||||
from html5lib.constants import cdataElements, rcdataElements, voidElements
|
||||
|
||||
from html5lib.constants import spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class LintError(Exception): pass
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def __iter__(self):
|
||||
open_elements = []
|
||||
contentModelFlag = "PCDATA"
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
if type == "StartTag" and name in voidElements:
|
||||
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
|
||||
elif type == "EmptyTag" and name not in voidElements:
|
||||
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
|
||||
if type == "StartTag":
|
||||
open_elements.append(name)
|
||||
for name, value in token["data"]:
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_("Attribute name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty attribute name"))
|
||||
if not isinstance(value, unicode):
|
||||
raise LintError(_("Attribute value is not a string: %r") % value)
|
||||
if name in cdataElements:
|
||||
contentModelFlag = "CDATA"
|
||||
elif name in rcdataElements:
|
||||
contentModelFlag = "RCDATA"
|
||||
elif name == "plaintext":
|
||||
contentModelFlag = "PLAINTEXT"
|
||||
|
||||
elif type == "EndTag":
|
||||
name = token["name"]
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
if name in voidElements:
|
||||
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
|
||||
start_name = open_elements.pop()
|
||||
if start_name != name:
|
||||
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
|
||||
contentModelFlag = "PCDATA"
|
||||
|
||||
elif type == "Comment":
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Comment not in PCDATA content model flag"))
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
data = token["data"]
|
||||
if not isinstance(data, unicode):
|
||||
raise LintError(_("Attribute name is not a string: %r") % data)
|
||||
if not data:
|
||||
raise LintError(_(u"%s token with empty data") % type)
|
||||
if type == "SpaceCharacters":
|
||||
data = data.strip(spaceCharacters)
|
||||
if data:
|
||||
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
|
||||
|
||||
elif type == "Doctype":
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
# XXX: what to do with token["data"] ?
|
||||
|
||||
elif type in ("ParseError", "SerializeError"):
|
||||
pass
|
||||
|
||||
else:
|
||||
raise LintError(_(u"Unknown token type: %s") % type)
|
||||
|
||||
yield token
|
||||
202  libs/html5lib/filters/optionaltags.py  Normal file
@@ -0,0 +1,202 @@
|
||||
import _base
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def slider(self):
|
||||
previous1 = previous2 = None
|
||||
for token in self.source:
|
||||
if previous1 is not None:
|
||||
yield previous2, previous1, token
|
||||
previous2 = previous1
|
||||
previous1 = token
|
||||
yield previous2, previous1, None
|
||||
|
||||
def __iter__(self):
|
||||
for previous, token, next in self.slider():
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if (token["data"] or
|
||||
not self.is_optional_start(token["name"], previous, next)):
|
||||
yield token
|
||||
elif type == "EndTag":
|
||||
if not self.is_optional_end(token["name"], next):
|
||||
yield token
|
||||
else:
|
||||
yield token
|
||||
|
||||
def is_optional_start(self, tagname, previous, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in 'html':
|
||||
# An html element's start tag may be omitted if the first thing
|
||||
# inside the html element is not a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname == 'head':
|
||||
# A head element's start tag may be omitted if the first thing
|
||||
# inside the head element is an element.
|
||||
# XXX: we also omit the start tag if the head element is empty
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return True
|
||||
elif type == "EndTag":
|
||||
return next["name"] == "head"
|
||||
elif tagname == 'body':
|
||||
# A body element's start tag may be omitted if the first thing
|
||||
# inside the body element is not a space character or a comment,
|
||||
# except if the first thing inside the body element is a script
|
||||
# or style element and the node immediately preceding the body
|
||||
# element is a head element whose end tag has been omitted.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we do not look at the preceding event, so we never omit
|
||||
# the body element's start tag if it's followed by a script or
|
||||
# a style element.
|
||||
return next["name"] not in ('script', 'style')
|
||||
else:
|
||||
return True
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's start tag may be omitted if the first thing
|
||||
# inside the colgroup element is a col element, and if the element
|
||||
# is not immediately preceeded by another colgroup element whose
|
||||
# end tag has been omitted.
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
# XXX: we do not look at the preceding event, so instead we never
|
||||
# omit the colgroup element's end tag when it is immediately
|
||||
# followed by another colgroup element. See is_optional_end.
|
||||
return next["name"] == "col"
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tbody':
|
||||
# A tbody element's start tag may be omitted if the first thing
|
||||
# inside the tbody element is a tr element, and if the element is
|
||||
# not immediately preceeded by a tbody, thead, or tfoot element
|
||||
# whose end tag has been omitted.
|
||||
if type == "StartTag":
|
||||
# omit the thead and tfoot elements' end tag when they are
|
||||
# immediately followed by a tbody element. See is_optional_end.
|
||||
if previous and previous['type'] == 'EndTag' and \
|
||||
previous['name'] in ('tbody','thead','tfoot'):
|
||||
return False
|
||||
return next["name"] == 'tr'
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
|
||||
def is_optional_end(self, tagname, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in ('html', 'head', 'body'):
|
||||
# An html element's end tag may be omitted if the html element
|
||||
# is not immediately followed by a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname in ('li', 'optgroup', 'tr'):
|
||||
# A li element's end tag may be omitted if the li element is
|
||||
# immediately followed by another li element or if there is
|
||||
# no more content in the parent element.
|
||||
# An optgroup element's end tag may be omitted if the optgroup
|
||||
# element is immediately followed by another optgroup element,
|
||||
# or if there is no more content in the parent element.
|
||||
# A tr element's end tag may be omitted if the tr element is
|
||||
# immediately followed by another tr element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] == tagname
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('dt', 'dd'):
|
||||
# A dt element's end tag may be omitted if the dt element is
|
||||
# immediately followed by another dt element or a dd element.
|
||||
# A dd element's end tag may be omitted if the dd element is
|
||||
# immediately followed by another dd element or a dt element,
|
||||
# or if there is no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('dt', 'dd')
|
||||
elif tagname == 'dd':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'p':
|
||||
# A p element's end tag may be omitted if the p element is
|
||||
# immediately followed by an address, article, aside,
|
||||
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
|
||||
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
|
||||
# nav, ol, p, pre, section, table, or ul, element, or if
|
||||
# there is no more content in the parent element.
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return next["name"] in ('address', 'article', 'aside',
|
||||
'blockquote', 'datagrid', 'dialog',
|
||||
'dir', 'div', 'dl', 'fieldset', 'footer',
|
||||
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'header', 'hr', 'menu', 'nav', 'ol',
|
||||
'p', 'pre', 'section', 'table', 'ul')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'option':
|
||||
# An option element's end tag may be omitted if the option
|
||||
# element is immediately followed by another option element,
|
||||
# or if it is immediately followed by an <code>optgroup</code>
|
||||
# element, or if there is no more content in the parent
|
||||
# element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('option', 'optgroup')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('rt', 'rp'):
|
||||
# An rt element's end tag may be omitted if the rt element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
# An rp element's end tag may be omitted if the rp element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('rt', 'rp')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's end tag may be omitted if the colgroup
|
||||
# element is not immediately followed by a space character or
|
||||
# a comment.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we also look for an immediately following colgroup
|
||||
# element. See is_optional_start.
|
||||
return next["name"] != 'colgroup'
|
||||
else:
|
||||
return True
|
||||
elif tagname in ('thead', 'tbody'):
|
||||
# A thead element's end tag may be omitted if the thead element
|
||||
# is immediately followed by a tbody or tfoot element.
|
||||
# A tbody element's end tag may be omitted if the tbody element
|
||||
# is immediately followed by a tbody or tfoot element, or if
|
||||
# there is no more content in the parent element.
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ['tbody', 'tfoot']
|
||||
elif tagname == 'tbody':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tfoot':
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] == 'tbody'
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('td', 'th'):
|
||||
# A td element's end tag may be omitted if the td element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
# A th element's end tag may be omitted if the th element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('td', 'th')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
return False
|
||||
8  libs/html5lib/filters/sanitizer.py  Normal file
@@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin

class Filter(_base.Filter, HTMLSanitizerMixin):
    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token: yield token
41  libs/html5lib/filters/whitespace.py  Normal file
@@ -0,0 +1,41 @@
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset

import re

import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)

class Filter(_base.Filter):

    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        preserve = 0
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
              and (preserve or token["name"] in self.spacePreserveElements):
                preserve += 1

            elif type == "EndTag" and preserve:
                preserve -= 1

            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
                token["data"] = u" "

            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token

def collapse_spaces(text):
    return SPACES_REGEX.sub(' ', text)
2733  libs/html5lib/html5parser.py  Normal file
File diff suppressed because it is too large
177  libs/html5lib/ihatexml.py  Normal file
@@ -0,0 +1,177 @@
|
||||
import re
|
||||
|
||||
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
||||
|
||||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
||||
|
||||
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
|
||||
|
||||
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
||||
|
||||
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
||||
|
||||
letter = " | ".join([baseChar, ideographic])
|
||||
|
||||
#Without the
|
||||
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
|
||||
extender])
|
||||
nameFirst = " | ".join([letter, "_"])
|
||||
|
||||
reChar = re.compile(r"#x([\d|A-F]{4,4})")
|
||||
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
|
||||
|
||||
def charStringToList(chars):
|
||||
charRanges = [item.strip() for item in chars.split(" | ")]
|
||||
rv = []
|
||||
for item in charRanges:
|
||||
foundMatch = False
|
||||
for regexp in (reChar, reCharRange):
|
||||
match = regexp.match(item)
|
||||
if match is not None:
|
||||
rv.append([hexToInt(item) for item in match.groups()])
|
||||
if len(rv[-1]) == 1:
|
||||
rv[-1] = rv[-1]*2
|
||||
foundMatch = True
|
||||
break
|
||||
if not foundMatch:
|
||||
assert len(item) == 1
|
||||
|
||||
rv.append([ord(item)] * 2)
|
||||
rv = normaliseCharList(rv)
|
||||
return rv
|
||||
|
||||
def normaliseCharList(charList):
|
||||
charList = sorted(charList)
|
||||
for item in charList:
|
||||
assert item[1] >= item[0]
|
||||
rv = []
|
||||
i = 0
|
||||
while i < len(charList):
|
||||
j = 1
|
||||
rv.append(charList[i])
|
||||
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
|
||||
rv[-1][1] = charList[i+j][1]
|
||||
j += 1
|
||||
i += j
|
||||
return rv
|
||||
|
||||
#We don't really support characters above the BMP :(
|
||||
max_unicode = int("FFFF", 16)
|
||||
|
||||
def missingRanges(charList):
|
||||
rv = []
|
||||
if charList[0] != 0:
|
||||
rv.append([0, charList[0][0] - 1])
|
||||
for i, item in enumerate(charList[:-1]):
|
||||
rv.append([item[1]+1, charList[i+1][0] - 1])
|
||||
if charList[-1][1] != max_unicode:
|
||||
rv.append([charList[-1][1] + 1, max_unicode])
|
||||
return rv
|
||||
|
||||
def listToRegexpStr(charList):
|
||||
rv = []
|
||||
for item in charList:
|
||||
if item[0] == item[1]:
|
||||
rv.append(escapeRegexp(unichr(item[0])))
|
||||
else:
|
||||
rv.append(escapeRegexp(unichr(item[0])) + "-" +
|
||||
escapeRegexp(unichr(item[1])))
|
||||
return "[%s]"%"".join(rv)
|
||||
|
||||
def hexToInt(hex_str):
|
||||
return int(hex_str, 16)
|
||||
|
||||
def escapeRegexp(string):
|
||||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
||||
"[", "]", "|", "(", ")", "-")
|
||||
for char in specialCharacters:
|
||||
string = string.replace(char, "\\" + char)
|
||||
if char in string:
|
||||
print string
|
||||
|
||||
return string
|
||||
|
||||
#output from the above
|
||||
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
|
||||
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
|
||||
class InfosetFilter(object):
|
||||
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
||||
def __init__(self, replaceChars = None,
|
||||
dropXmlnsLocalName = False,
|
||||
dropXmlnsAttrNs = False,
|
||||
preventDoubleDashComments = False,
|
||||
preventDashAtCommentEnd = False,
|
||||
replaceFormFeedCharacters = True):
|
||||
|
||||
self.dropXmlnsLocalName = dropXmlnsLocalName
|
||||
self.dropXmlnsAttrNs = dropXmlnsAttrNs
|
||||
|
||||
self.preventDoubleDashComments = preventDoubleDashComments
|
||||
self.preventDashAtCommentEnd = preventDashAtCommentEnd
|
||||
|
||||
self.replaceFormFeedCharacters = replaceFormFeedCharacters
|
||||
|
||||
self.replaceCache = {}
|
||||
|
||||
def coerceAttribute(self, name, namespace=None):
|
||||
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
|
||||
#Need a datalosswarning here
|
||||
return None
|
||||
elif (self.dropXmlnsAttrNs and
|
||||
namespace == "http://www.w3.org/2000/xmlns/"):
|
||||
return None
|
||||
else:
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceElement(self, name, namespace=None):
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceComment(self, data):
|
||||
if self.preventDoubleDashComments:
|
||||
while "--" in data:
|
||||
data = data.replace("--", "- -")
|
||||
return data
|
||||
|
||||
def coerceCharacters(self, data):
|
||||
if self.replaceFormFeedCharacters:
|
||||
data = data.replace("\x0C", " ")
|
||||
#Other non-xml characters
|
||||
return data
|
||||
|
||||
def toXmlName(self, name):
|
||||
nameFirst = name[0]
|
||||
nameRest = name[1:]
|
||||
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
|
||||
if m:
|
||||
nameFirstOutput = self.getReplacementCharacter(nameFirst)
|
||||
else:
|
||||
nameFirstOutput = nameFirst
|
||||
|
||||
nameRestOutput = nameRest
|
||||
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
|
||||
for char in replaceChars:
|
||||
replacement = self.getReplacementCharacter(char)
|
||||
nameRestOutput = nameRestOutput.replace(char, replacement)
|
||||
return nameFirstOutput + nameRestOutput
|
||||
|
||||
def getReplacementCharacter(self, char):
|
||||
if char in self.replaceCache:
|
||||
replacement = self.replaceCache[char]
|
||||
else:
|
||||
replacement = self.escapeChar(char)
|
||||
return replacement
|
||||
|
||||
def fromXmlName(self, name):
|
||||
for item in set(self.replacementRegexp.findall(name)):
|
||||
name = name.replace(item, self.unescapeChar(item))
|
||||
return name
|
||||
|
||||
def escapeChar(self, char):
|
||||
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
|
||||
self.replaceCache[char] = replacement
|
||||
return replacement
|
||||
|
||||
def unescapeChar(self, charcode):
|
||||
return unichr(int(charcode[1:], 16))
|
||||
782  libs/html5lib/inputstream.py  Normal file
@@ -0,0 +1,782 @@
|
||||
import codecs
|
||||
import re
|
||||
import types
|
||||
import sys
|
||||
|
||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from constants import encodings, ReparseException
|
||||
import utils
|
||||
|
||||
#Non-unicode versions of constants for use in the pre-parser
|
||||
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
|
||||
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
|
||||
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
|
||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
|
||||
|
||||
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
||||
|
||||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
|
||||
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
|
||||
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
|
||||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
||||
0x10FFFE, 0x10FFFF])
|
||||
|
||||
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
|
||||
|
||||
# Cache for charsUntil()
|
||||
charsUntilRegEx = {}
|
||||
|
||||
class BufferedStream:
|
||||
"""Buffering for streams that do not have buffering of their own
|
||||
|
||||
The buffer is implemented as a list of chunks on the assumption that
|
||||
joining many strings will be slow since it is O(n**2)
|
||||
"""
|
||||
|
||||
def __init__(self, stream):
|
||||
self.stream = stream
|
||||
self.buffer = []
|
||||
self.position = [-1,0] #chunk number, offset
|
||||
|
||||
def tell(self):
|
||||
pos = 0
|
||||
for chunk in self.buffer[:self.position[0]]:
|
||||
pos += len(chunk)
|
||||
pos += self.position[1]
|
||||
return pos
|
||||
|
||||
def seek(self, pos):
|
||||
assert pos < self._bufferedBytes()
|
||||
offset = pos
|
||||
i = 0
|
||||
while len(self.buffer[i]) < offset:
|
||||
offset -= pos
|
||||
i += 1
|
||||
self.position = [i, offset]
|
||||
|
||||
def read(self, bytes):
|
||||
if not self.buffer:
|
||||
return self._readStream(bytes)
|
||||
elif (self.position[0] == len(self.buffer) and
|
||||
self.position[1] == len(self.buffer[-1])):
|
||||
return self._readStream(bytes)
|
||||
else:
|
||||
return self._readFromBuffer(bytes)
|
||||
|
||||
def _bufferedBytes(self):
|
||||
return sum([len(item) for item in self.buffer])
|
||||
|
||||
def _readStream(self, bytes):
|
||||
data = self.stream.read(bytes)
|
||||
self.buffer.append(data)
|
||||
self.position[0] += 1
|
||||
self.position[1] = len(data)
|
||||
return data
|
||||
|
||||
def _readFromBuffer(self, bytes):
|
||||
remainingBytes = bytes
|
||||
rv = []
|
||||
bufferIndex = self.position[0]
|
||||
bufferOffset = self.position[1]
|
||||
while bufferIndex < len(self.buffer) and remainingBytes != 0:
|
||||
assert remainingBytes > 0
|
||||
bufferedData = self.buffer[bufferIndex]
|
||||
|
||||
if remainingBytes <= len(bufferedData) - bufferOffset:
|
||||
bytesToRead = remainingBytes
|
||||
self.position = [bufferIndex, bufferOffset + bytesToRead]
|
||||
else:
|
||||
bytesToRead = len(bufferedData) - bufferOffset
|
||||
self.position = [bufferIndex, len(bufferedData)]
|
||||
bufferIndex += 1
|
||||
data = rv.append(bufferedData[bufferOffset:
|
||||
bufferOffset + bytesToRead])
|
||||
remainingBytes -= bytesToRead
|
||||
|
||||
bufferOffset = 0
|
||||
|
||||
if remainingBytes:
|
||||
rv.append(self._readStream(remainingBytes))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
|
||||
|
||||
class HTMLInputStream:
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
_defaultChunkSize = 10240
|
||||
|
||||
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by html5lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
parseMeta - Look for a <meta> element containing encoding information
|
||||
|
||||
"""
|
||||
|
||||
#Craziness
|
||||
if len(u"\U0010FFFF") == 1:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||
self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
|
||||
else:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||
self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
||||
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
|
||||
self.charEncoding = (codecName(encoding), "certain")
|
||||
|
||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||
# self.charEncoding as appropriate
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
# Encoding Information
|
||||
#Number of bytes to use when looking for a meta element with
|
||||
#encoding information
|
||||
self.numBytesMeta = 512
|
||||
#Number of bytes to use when using detecting encoding using chardet
|
||||
self.numBytesChardet = 100
|
||||
#Encoding to use if no other information can be found
|
||||
self.defaultEncoding = "windows-1252"
|
||||
|
||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if (self.charEncoding[0] is None):
|
||||
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
||||
|
||||
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
|
||||
'replace')
|
||||
|
||||
self.chunk = u""
|
||||
self.chunkSize = 0
|
||||
self.chunkOffset = 0
|
||||
self.errors = []
|
||||
|
||||
# number of (complete) lines in previous chunks
|
||||
self.prevNumLines = 0
|
||||
# number of columns in the last line of the previous chunk
|
||||
self.prevNumCols = 0
|
||||
|
||||
#Deal with CR LF and surrogates split over chunk boundaries
|
||||
self._bufferedCharacter = None
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
|
||||
source can be either a file object, local filename or a string.
|
||||
|
||||
"""
|
||||
# Already a file object
|
||||
if hasattr(source, 'read'):
|
||||
stream = source
|
||||
else:
|
||||
# Otherwise treat source as a string and convert to a file object
|
||||
if isinstance(source, unicode):
|
||||
source = source.encode('utf-8')
|
||||
self.charEncoding = ("utf-8", "certain")
|
||||
try:
|
||||
from io import BytesIO
|
||||
except:
|
||||
# 2to3 converts this line to: from io import StringIO
|
||||
from cStringIO import StringIO as BytesIO
|
||||
stream = BytesIO(source)
|
||||
|
||||
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
|
||||
stream is sys.stdin):
|
||||
stream = BufferedStream(stream)
|
||||
|
||||
return stream
|
||||
|
||||
def detectEncoding(self, parseMeta=True, chardet=True):
|
||||
#First look for a BOM
|
||||
#This will also read past the BOM if present
|
||||
encoding = self.detectBOM()
|
||||
confidence = "certain"
|
||||
#If there is no BOM need to look for meta elements with encoding
|
||||
#information
|
||||
if encoding is None and parseMeta:
|
||||
encoding = self.detectEncodingMeta()
|
||||
confidence = "tentative"
|
||||
#Guess with chardet, if avaliable
|
||||
if encoding is None and chardet:
|
||||
confidence = "tentative"
|
||||
try:
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
buffers = []
|
||||
detector = UniversalDetector()
|
||||
while not detector.done:
|
||||
buffer = self.rawStream.read(self.numBytesChardet)
|
||||
if not buffer:
|
||||
break
|
||||
buffers.append(buffer)
|
||||
detector.feed(buffer)
|
||||
detector.close()
|
||||
encoding = detector.result['encoding']
|
||||
self.rawStream.seek(0)
|
||||
except ImportError:
|
||||
pass
|
||||
# If all else fails use the default encoding
|
||||
if encoding is None:
|
||||
confidence="tentative"
|
||||
encoding = self.defaultEncoding
|
||||
|
||||
#Substitute for equivalent encodings:
|
||||
encodingSub = {"iso-8859-1":"windows-1252"}
|
||||
|
||||
if encoding.lower() in encodingSub:
|
||||
encoding = encodingSub[encoding.lower()]
|
||||
|
||||
return encoding, confidence
|
||||
|
||||
def changeEncoding(self, newEncoding):
|
||||
newEncoding = codecName(newEncoding)
|
||||
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||
newEncoding = "utf-8"
|
||||
if newEncoding is None:
|
||||
return
|
||||
elif newEncoding == self.charEncoding[0]:
|
||||
self.charEncoding = (self.charEncoding[0], "certain")
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
self.reset()
|
||||
self.charEncoding = (newEncoding, "certain")
|
||||
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
|
||||
|
||||
def detectBOM(self):
|
||||
"""Attempts to detect at BOM at the start of the stream. If
|
||||
an encoding can be determined from the BOM return the name of the
|
||||
encoding otherwise return None"""
|
||||
bomDict = {
|
||||
codecs.BOM_UTF8: 'utf-8',
|
||||
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
string = self.rawStream.read(4)
|
||||
|
||||
# Try detecting the BOM using bytes from the string
|
||||
encoding = bomDict.get(string[:3]) # UTF-8
|
||||
seek = 3
|
||||
if not encoding:
|
||||
# Need to detect UTF-32 before UTF-16
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string[:2]) # UTF-16
|
||||
seek = 2
|
||||
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
self.rawStream.seek(encoding and seek or 0)
|
||||
|
||||
return encoding
|
||||
|
||||
def detectEncodingMeta(self):
|
||||
"""Report the encoding declared by the meta element
|
||||
"""
|
||||
buffer = self.rawStream.read(self.numBytesMeta)
|
||||
parser = EncodingParser(buffer)
|
||||
self.rawStream.seek(0)
|
||||
encoding = parser.getEncoding()
|
||||
|
||||
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||
encoding = "utf-8"
|
||||
|
||||
return encoding
|
||||
|
||||
def _position(self, offset):
|
||||
chunk = self.chunk
|
||||
nLines = chunk.count(u'\n', 0, offset)
|
||||
positionLine = self.prevNumLines + nLines
|
||||
lastLinePos = chunk.rfind(u'\n', 0, offset)
|
||||
if lastLinePos == -1:
|
||||
positionColumn = self.prevNumCols + offset
|
||||
else:
|
||||
positionColumn = offset - (lastLinePos + 1)
|
||||
return (positionLine, positionColumn)
|
||||
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
line, col = self._position(self.chunkOffset)
|
||||
return (line+1, col)
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
EOF when EOF is reached.
|
||||
"""
|
||||
# Read a new chunk from the input stream if necessary
|
||||
if self.chunkOffset >= self.chunkSize:
|
||||
if not self.readChunk():
|
||||
return EOF
|
||||
|
||||
chunkOffset = self.chunkOffset
|
||||
char = self.chunk[chunkOffset]
|
||||
self.chunkOffset = chunkOffset + 1
|
||||
|
||||
return char
|
||||
|
||||
    def readChunk(self, chunkSize=None):
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = u""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        data = self.replaceCharactersRegexp.sub(u"\ufffd", data)

        data = data.replace(u"\r\n", u"\n")
        data = data.replace(u"\r", u"\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

def characterErrorsUCS4(self, data):
|
||||
for i in xrange(len(invalid_unicode_re.findall(data))):
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
def characterErrorsUCS2(self, data):
|
||||
#Someone picked the wrong compile option
|
||||
#You lose
|
||||
skip = False
|
||||
import sys
|
||||
for match in invalid_unicode_re.finditer(data):
|
||||
if skip:
|
||||
continue
|
||||
codepoint = ord(match.group())
|
||||
pos = match.start()
|
||||
#Pretty sure there should be endianness issues here
|
||||
if utils.isSurrogatePair(data[pos:pos+2]):
|
||||
#We have a surrogate pair!
|
||||
char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
|
||||
if char_val in non_bmp_invalid_codepoints:
|
||||
self.errors.append("invalid-codepoint")
|
||||
skip = True
|
||||
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
|
||||
pos == len(data) - 1):
|
||||
self.errors.append("invalid-codepoint")
|
||||
else:
|
||||
skip = False
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
    def charsUntil(self, characters, opposite = False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = u"^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
                # If the whole remainder of the chunk matched,
                # use it all and read the next chunk
                rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = u"".join(rv)
        return r
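
    # Illustrative sketch (assumed stream contents, not part of the original
    # source): with the remaining input u"abc<def", the call below collects
    # characters up to, but not including, the first "<":
    #
    #   stream.charsUntil(frozenset(u"<"))   # -> u"abc"; "<" is left unconsumed
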
    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char

class EncodingBytes(str):
|
||||
"""String-like object with an associated position and various extra methods
|
||||
If the position is ever greater than the string length then an exception is
|
||||
raised"""
|
||||
def __new__(self, value):
|
||||
return str.__new__(self, value.lower())
|
||||
|
||||
def __init__(self, value):
|
||||
self._position=-1
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
p = self._position = self._position + 1
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
return self[p]
|
||||
|
||||
def previous(self):
|
||||
p = self._position
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
self._position = p = p - 1
|
||||
return self[p]
|
||||
|
||||
def setPosition(self, position):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
self._position = position
|
||||
|
||||
def getPosition(self):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
if self._position >= 0:
|
||||
return self._position
|
||||
else:
|
||||
return None
|
||||
|
||||
position = property(getPosition, setPosition)
|
||||
|
||||
def getCurrentByte(self):
|
||||
return self[self.position]
|
||||
|
||||
currentByte = property(getCurrentByte)
|
||||
|
||||
def skip(self, chars=spaceCharactersBytes):
|
||||
"""Skip past a list of characters"""
|
||||
p = self.position # use property for the error-checking
|
||||
while p < len(self):
|
||||
c = self[p]
|
||||
if c not in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def skipUntil(self, chars):
|
||||
p = self.position
|
||||
while p < len(self):
|
||||
c = self[p]
|
||||
if c in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def matchBytes(self, bytes):
|
||||
"""Look for a sequence of bytes at the start of a string. If the bytes
|
||||
are found return True and advance the position to the byte after the
|
||||
match. Otherwise return False and leave the position alone"""
|
||||
p = self.position
|
||||
data = self[p:p+len(bytes)]
|
||||
rv = data.startswith(bytes)
|
||||
if rv:
|
||||
self.position += len(bytes)
|
||||
return rv
|
||||
|
||||
def jumpTo(self, bytes):
|
||||
"""Look for the next sequence of bytes matching a given sequence. If
|
||||
a match is found advance the position to the last byte of the match"""
|
||||
newPosition = self[self.position:].find(bytes)
|
||||
if newPosition > -1:
|
||||
# XXX: This is ugly, but I can't see a nicer way to fix this.
|
||||
if self._position == -1:
|
||||
self._position = 0
|
||||
self._position += (newPosition + len(bytes)-1)
|
||||
return True
|
||||
else:
|
||||
raise StopIteration
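
    # Illustrative sketch (not part of the original source): the value is stored
    # lower-cased and the cursor starts before the first byte, e.g.
    #
    #   b = EncodingBytes("<META charset=utf-8>")
    #   b.next()                # -> "<"
    #   b.matchBytes("<meta")   # -> True, cursor advances past the match
    #   b.currentByte           # -> " "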
|
||||
|
||||
class EncodingParser(object):
|
||||
"""Mini parser for detecting character encoding from meta elements"""
|
||||
|
||||
def __init__(self, data):
|
||||
"""string - the data to work on for encoding detection"""
|
||||
self.data = EncodingBytes(data)
|
||||
self.encoding = None
|
||||
|
||||
def getEncoding(self):
|
||||
methodDispatch = (
|
||||
("<!--",self.handleComment),
|
||||
("<meta",self.handleMeta),
|
||||
("</",self.handlePossibleEndTag),
|
||||
("<!",self.handleOther),
|
||||
("<?",self.handleOther),
|
||||
("<",self.handlePossibleStartTag))
|
||||
for byte in self.data:
|
||||
keepParsing = True
|
||||
for key, method in methodDispatch:
|
||||
if self.data.matchBytes(key):
|
||||
try:
|
||||
keepParsing = method()
|
||||
break
|
||||
except StopIteration:
|
||||
keepParsing=False
|
||||
break
|
||||
if not keepParsing:
|
||||
break
|
||||
|
||||
return self.encoding
|
||||
|
||||
def handleComment(self):
|
||||
"""Skip over comments"""
|
||||
return self.data.jumpTo("-->")
|
||||
|
||||
def handleMeta(self):
|
||||
if self.data.currentByte not in spaceCharactersBytes:
|
||||
#if we have <meta not followed by a space so just keep going
|
||||
return True
|
||||
#We have a valid meta element we want to search for attributes
|
||||
while True:
|
||||
#Try to find the next attribute after the current position
|
||||
attr = self.getAttribute()
|
||||
if attr is None:
|
||||
return True
|
||||
else:
|
||||
if attr[0] == "charset":
|
||||
tentativeEncoding = attr[1]
|
||||
codec = codecName(tentativeEncoding)
|
||||
if codec is not None:
|
||||
self.encoding = codec
|
||||
return False
|
||||
elif attr[0] == "content":
|
||||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
||||
tentativeEncoding = contentParser.parse()
|
||||
codec = codecName(tentativeEncoding)
|
||||
if codec is not None:
|
||||
self.encoding = codec
|
||||
return False
|
||||
|
||||
def handlePossibleStartTag(self):
|
||||
return self.handlePossibleTag(False)
|
||||
|
||||
def handlePossibleEndTag(self):
|
||||
self.data.next()
|
||||
return self.handlePossibleTag(True)
|
||||
|
||||
def handlePossibleTag(self, endTag):
|
||||
data = self.data
|
||||
if data.currentByte not in asciiLettersBytes:
|
||||
#If the next byte is not an ascii letter either ignore this
|
||||
#fragment (possible start tag case) or treat it according to
|
||||
#handleOther
|
||||
if endTag:
|
||||
data.previous()
|
||||
self.handleOther()
|
||||
return True
|
||||
|
||||
c = data.skipUntil(spacesAngleBrackets)
|
||||
if c == "<":
|
||||
#return to the first step in the overall "two step" algorithm
|
||||
#reprocessing the < byte
|
||||
data.previous()
|
||||
else:
|
||||
#Read all attributes
|
||||
attr = self.getAttribute()
|
||||
while attr is not None:
|
||||
attr = self.getAttribute()
|
||||
return True
|
||||
|
||||
def handleOther(self):
|
||||
return self.data.jumpTo(">")
|
||||
|
||||
def getAttribute(self):
|
||||
"""Return a name,value pair for the next attribute in the stream,
|
||||
if one is found, or None"""
|
||||
data = self.data
|
||||
# Step 1 (skip chars)
|
||||
c = data.skip(spaceCharactersBytes | frozenset("/"))
|
||||
# Step 2
|
||||
if c in (">", None):
|
||||
return None
|
||||
# Step 3
|
||||
attrName = []
|
||||
attrValue = []
|
||||
#Step 4 attribute name
|
||||
while True:
|
||||
if c == "=" and attrName:
|
||||
break
|
||||
elif c in spaceCharactersBytes:
|
||||
#Step 6!
|
||||
c = data.skip()
|
||||
c = data.next()
|
||||
break
|
||||
elif c in ("/", ">"):
|
||||
return "".join(attrName), ""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrName.append(c.lower())
|
||||
elif c == None:
|
||||
return None
|
||||
else:
|
||||
attrName.append(c)
|
||||
#Step 5
|
||||
c = data.next()
|
||||
#Step 7
|
||||
if c != "=":
|
||||
data.previous()
|
||||
return "".join(attrName), ""
|
||||
#Step 8
|
||||
data.next()
|
||||
#Step 9
|
||||
c = data.skip()
|
||||
#Step 10
|
||||
if c in ("'", '"'):
|
||||
#10.1
|
||||
quoteChar = c
|
||||
while True:
|
||||
#10.2
|
||||
c = data.next()
|
||||
#10.3
|
||||
if c == quoteChar:
|
||||
data.next()
|
||||
return "".join(attrName), "".join(attrValue)
|
||||
#10.4
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
#10.5
|
||||
else:
|
||||
attrValue.append(c)
|
||||
elif c == ">":
|
||||
return "".join(attrName), ""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
# Step 11
|
||||
while True:
|
||||
c = data.next()
|
||||
if c in spacesAngleBrackets:
|
||||
return "".join(attrName), "".join(attrValue)
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
|
||||
|
||||
class ContentAttrParser(object):
    def __init__(self, data):
        self.data = data
    def parse(self):
        try:
            #Check if the attr name is charset
            #otherwise return
            self.data.jumpTo("charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == "=":
                #If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            #Look for an encoding between matching quote marks
            if self.data.currentByte in ('"', "'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                #Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    #Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None


def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if (encoding is not None and type(encoding) in types.StringTypes):
        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
        return encodings.get(canonicalName, None)
    else:
        return None
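
# Illustrative sketch (not part of the original source): codecName strips ASCII
# punctuation and lower-cases the label before the lookup, so, assuming the
# usual entries in the encodings table:
#
#   codecName("UTF_8")        # -> "utf-8"
#   codecName("no-such-enc")  # -> None
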
258  libs/html5lib/sanitizer.py  Normal file
@@ -0,0 +1,258 @@
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
|
||||
from tokenizer import HTMLTokenizer
|
||||
from constants import tokenTypes
|
||||
|
||||
class HTMLSanitizerMixin(object):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
|
||||
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
|
||||
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
|
||||
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
|
||||
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
|
||||
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
|
||||
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
|
||||
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
|
||||
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
|
||||
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
|
||||
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
|
||||
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
|
||||
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
|
||||
|
||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||
'munderover', 'none']
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
|
||||
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
|
||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
|
||||
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
|
||||
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
|
||||
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
|
||||
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
|
||||
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
|
||||
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
|
||||
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
|
||||
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
|
||||
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
|
||||
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
|
||||
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
|
||||
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
|
||||
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
|
||||
'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
|
||||
'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
|
||||
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
|
||||
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
|
||||
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
|
||||
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
|
||||
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
|
||||
'width', 'wrap', 'xml:lang']
|
||||
|
||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||
|
||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
|
||||
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
|
||||
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
|
||||
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
|
||||
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
|
||||
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
|
||||
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
|
||||
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
|
||||
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
|
||||
'opacity', 'orient', 'origin', 'overline-position',
|
||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
|
||||
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
|
||||
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
|
||||
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
|
||||
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
|
||||
'transform', 'type', 'u1', 'u2', 'underline-position',
|
||||
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
|
||||
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
|
||||
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
|
||||
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
|
||||
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
|
||||
'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
||||
'xlink:href', 'xml:base']
|
||||
|
||||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
|
||||
'mask', 'stroke']
|
||||
|
||||
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
|
||||
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
|
||||
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
|
||||
'set', 'use']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||
'white-space', 'width']
|
||||
|
||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||
'transparent', 'underline', 'white', 'yellow']
|
||||
|
||||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-opacity']
|
||||
|
||||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs' ]
|
||||
|
||||
# subclasses may define their own versions of these constants
|
||||
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
||||
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
|
||||
allowed_css_properties = acceptable_css_properties
|
||||
allowed_css_keywords = acceptable_css_keywords
|
||||
allowed_svg_properties = acceptable_svg_properties
|
||||
allowed_protocols = acceptable_protocols
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
# attributes are parsed, and a restricted set, # specified by
|
||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||
# in ALLOWED_PROTOCOLS are allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_token(self, token):
|
||||
|
||||
# accommodate filters which use token_type differently
|
||||
token_type = token["type"]
|
||||
if token_type in tokenTypes.keys():
|
||||
token_type = tokenTypes[token_type]
|
||||
|
||||
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||
tokenTypes["EmptyTag"]):
|
||||
if token["name"] in self.allowed_elements:
|
||||
if token.has_key("data"):
|
||||
attrs = dict([(name,val) for name,val in
|
||||
token["data"][::-1]
|
||||
if name in self.allowed_attributes])
|
||||
for attr in self.attr_val_is_uri:
|
||||
if not attrs.has_key(attr):
|
||||
continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
||||
unescape(attrs[attr])).lower()
|
||||
#remove replacement characters from unescaped characters
|
||||
val_unescaped = val_unescaped.replace(u"\ufffd", "")
|
||||
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
|
||||
(val_unescaped.split(':')[0] not in
|
||||
self.allowed_protocols)):
|
||||
del attrs[attr]
|
||||
for attr in self.svg_attr_val_allows_ref:
|
||||
if attr in attrs:
|
||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||
' ',
|
||||
unescape(attrs[attr]))
|
||||
if (token["name"] in self.svg_allow_local_href and
|
||||
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
|
||||
attrs['xlink:href'])):
|
||||
del attrs['xlink:href']
|
||||
if attrs.has_key('style'):
|
||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||
return token
|
||||
else:
|
||||
if token_type == tokenTypes["EndTag"]:
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token.get("selfClosing"):
|
||||
token["data"]=token["data"][:-1] + "/>"
|
||||
|
||||
if token["type"] in tokenTypes.keys():
|
||||
token["type"] = "Characters"
|
||||
else:
|
||||
token["type"] = tokenTypes["Characters"]
|
||||
|
||||
del token["name"]
|
||||
return token
|
||||
elif token_type == tokenTypes["Comment"]:
|
||||
pass
|
||||
else:
|
||||
return token
|
||||
|
||||
def sanitize_css(self, style):
|
||||
# disallow urls
|
||||
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
|
||||
|
||||
# gauntlet
|
||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
||||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
|
||||
|
||||
clean = []
|
||||
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
|
||||
if not value: continue
|
||||
if prop.lower() in self.allowed_css_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.split('-')[0].lower() in ['background','border','margin',
|
||||
'padding']:
|
||||
for keyword in value.split():
|
||||
if not keyword in self.acceptable_css_keywords and \
|
||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
|
||||
break
|
||||
else:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.lower() in self.allowed_svg_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
|
||||
return ' '.join(clean)
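
    # Illustrative sketch (assumed input, not part of the original source):
    # properties outside allowed_css_properties (and the whitelisted shorthand
    # prefixes) are dropped, e.g.
    #
    #   self.sanitize_css("color: red; position: absolute")   # -> "color: red;"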
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
|
||||
#Change case matching defaults as we only output lowercase html anyway
|
||||
#This solution doesn't seem ideal...
|
||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
|
||||
lowercaseElementName, lowercaseAttrName, parser=parser)
|
||||
|
||||
def __iter__(self):
|
||||
for token in HTMLTokenizer.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token:
|
||||
yield token
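

# Illustrative usage sketch (assumed wiring, not part of the original source):
# the sanitizing tokenizer is normally plugged into the parser, roughly:
#
#   import html5lib
#   from html5lib import sanitizer
#   p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
#   p.parseFragment('<script>do_nasty_stuff()</script>')   # script markup is escaped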
|
||||
17  libs/html5lib/serializer/__init__.py  Normal file
@@ -0,0 +1,17 @@

from html5lib import treewalkers

from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer

def serialize(input, tree="simpletree", format="html", encoding=None,
              **serializer_opts):
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == "html":
        s = HTMLSerializer(**serializer_opts)
    elif format == "xhtml":
        s = XHTMLSerializer(**serializer_opts)
    else:
        raise ValueError, "type must be either html or xhtml"
    return s.render(walker(input), encoding)
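
# Illustrative usage sketch (assumed tree type, not part of the original source):
#
#   import html5lib
#   from html5lib import serializer
#   doc = html5lib.parse("<p>Hello<p>world", treebuilder="simpletree")
#   serializer.serialize(doc, tree="simpletree", format="html")
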
312  libs/html5lib/serializer/htmlserializer.py  Normal file
@@ -0,0 +1,312 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from html5lib.constants import rcdataElements, entities, xmlEntities
|
||||
from html5lib import utils
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
try:
|
||||
from codecs import register_error, xmlcharrefreplace_errors
|
||||
except ImportError:
|
||||
unicode_encode_errors = "strict"
|
||||
else:
|
||||
unicode_encode_errors = "htmlentityreplace"
|
||||
|
||||
from html5lib.constants import entities
|
||||
|
||||
encode_entity_map = {}
|
||||
is_ucs4 = len(u"\U0010FFFF") == 1
|
||||
for k, v in entities.items():
|
||||
#skip multi-character entities
|
||||
if ((is_ucs4 and len(v) > 1) or
|
||||
(not is_ucs4 and len(v) > 2)):
|
||||
continue
|
||||
if v != "&":
|
||||
if len(v) == 2:
|
||||
v = utils.surrogatePairToCodepoint(v)
|
||||
else:
|
||||
try:
|
||||
v = ord(v)
|
||||
except:
|
||||
print v
|
||||
raise
|
||||
if not v in encode_entity_map or k.islower():
|
||||
# prefer < over < and similarly for &, >, etc.
|
||||
encode_entity_map[v] = k
|
||||
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
codepoints = []
|
||||
skip = False
|
||||
for i, c in enumerate(exc.object[exc.start:exc.end]):
|
||||
if skip:
|
||||
skip = False
|
||||
continue
|
||||
index = i + exc.start
|
||||
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
|
||||
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
|
||||
skip = True
|
||||
else:
|
||||
codepoint = ord(c)
|
||||
codepoints.append(codepoint)
|
||||
for cp in codepoints:
|
||||
e = encode_entity_map.get(cp)
|
||||
if e:
|
||||
res.append("&")
|
||||
res.append(e)
|
||||
if not e.endswith(";"):
|
||||
res.append(";")
|
||||
else:
|
||||
res.append("&#x%s;"%(hex(cp)[2:]))
|
||||
return (u"".join(res), exc.end)
|
||||
else:
|
||||
return xmlcharrefreplace_errors(exc)
|
||||
|
||||
register_error(unicode_encode_errors, htmlentityreplace_errors)
|
||||
|
||||
del register_error
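
# Illustrative sketch (not part of the original source): once registered, the
# "htmlentityreplace" error handler substitutes named entities (or numeric
# character references) for characters the target codec cannot represent, e.g.
#
#   u"caf\u00e9 \u2260 cafe".encode("ascii", "htmlentityreplace")
#   # -> "caf&eacute; &ne; cafe" (exact entity names depend on the entity table)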
|
||||
|
||||
|
||||
class HTMLSerializer(object):
|
||||
|
||||
# attribute quoting options
|
||||
quote_attr_values = False
|
||||
quote_char = u'"'
|
||||
use_best_quote_char = True
|
||||
|
||||
# tag syntax options
|
||||
omit_optional_tags = True
|
||||
minimize_boolean_attributes = True
|
||||
use_trailing_solidus = False
|
||||
space_before_trailing_solidus = True
|
||||
|
||||
# escaping options
|
||||
escape_lt_in_attrs = False
|
||||
escape_rcdata = False
|
||||
resolve_entities = True
|
||||
|
||||
# miscellaneous options
|
||||
inject_meta_charset = True
|
||||
strip_whitespace = False
|
||||
sanitize = False
|
||||
|
||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||
"minimize_boolean_attributes", "use_trailing_solidus",
|
||||
"space_before_trailing_solidus", "omit_optional_tags",
|
||||
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
||||
"escape_rcdata", "resolve_entities", "sanitize")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize HTMLSerializer.
|
||||
|
||||
Keyword options (default given first unless specified) include:
|
||||
|
||||
inject_meta_charset=True|False
|
||||
Whether it insert a meta element to define the character set of the
|
||||
document.
|
||||
quote_attr_values=True|False
|
||||
Whether to quote attribute values that don't require quoting
|
||||
per HTML5 parsing rules.
|
||||
quote_char=u'"'|u"'"
|
||||
Use given quote character for attribute quoting. Default is to
|
||||
use double quote unless attribute value contains a double quote,
|
||||
in which case single quotes are used instead.
|
||||
escape_lt_in_attrs=False|True
|
||||
Whether to escape < in attribute values.
|
||||
escape_rcdata=False|True
|
||||
Whether to escape characters that need to be escaped within normal
|
||||
elements within rcdata elements such as style.
|
||||
resolve_entities=True|False
|
||||
Whether to resolve named character entities that appear in the
|
||||
source tree. The XML predefined entities < > & " '
|
||||
are unaffected by this setting.
|
||||
strip_whitespace=False|True
|
||||
Whether to remove semantically meaningless whitespace. (This
|
||||
compresses all whitespace to a single space except within pre.)
|
||||
minimize_boolean_attributes=True|False
|
||||
Shortens boolean attributes to give just the attribute value,
|
||||
for example <input disabled="disabled"> becomes <input disabled>.
|
||||
use_trailing_solidus=False|True
|
||||
Includes a close-tag slash at the end of the start tag of void
|
||||
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
|
||||
space_before_trailing_solidus=True|False
|
||||
Places a space immediately before the closing slash in a tag
|
||||
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
|
||||
sanitize=False|True
|
||||
Strip all unsafe or unknown constructs from output.
|
||||
See `html5lib user documentation`_
|
||||
omit_optional_tags=True|False
|
||||
Omit start/end tags that are optional.
|
||||
|
||||
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
|
||||
"""
|
||||
if kwargs.has_key('quote_char'):
|
||||
self.use_best_quote_char = False
|
||||
for attr in self.options:
|
||||
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
||||
self.errors = []
|
||||
self.strict = False
|
||||
|
||||
def encode(self, string):
|
||||
assert(isinstance(string, unicode))
|
||||
if self.encoding:
|
||||
return string.encode(self.encoding, unicode_encode_errors)
|
||||
else:
|
||||
return string
|
||||
|
||||
def encodeStrict(self, string):
|
||||
assert(isinstance(string, unicode))
|
||||
if self.encoding:
|
||||
return string.encode(self.encoding, "strict")
|
||||
else:
|
||||
return string
|
||||
|
||||
def serialize(self, treewalker, encoding=None):
|
||||
self.encoding = encoding
|
||||
in_cdata = False
|
||||
self.errors = []
|
||||
if encoding and self.inject_meta_charset:
|
||||
from html5lib.filters.inject_meta_charset import Filter
|
||||
treewalker = Filter(treewalker, encoding)
|
||||
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
||||
# for maximum efficiently of this latter filter
|
||||
if self.strip_whitespace:
|
||||
from html5lib.filters.whitespace import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.sanitize:
|
||||
from html5lib.filters.sanitizer import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.omit_optional_tags:
|
||||
from html5lib.filters.optionaltags import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
for token in treewalker:
|
||||
type = token["type"]
|
||||
if type == "Doctype":
|
||||
doctype = u"<!DOCTYPE %s" % token["name"]
|
||||
|
||||
if token["publicId"]:
|
||||
doctype += u' PUBLIC "%s"' % token["publicId"]
|
||||
elif token["systemId"]:
|
||||
doctype += u" SYSTEM"
|
||||
if token["systemId"]:
|
||||
if token["systemId"].find(u'"') >= 0:
|
||||
if token["systemId"].find(u"'") >= 0:
|
||||
self.serializeError(_("System identifer contains both single and double quote characters"))
|
||||
quote_char = u"'"
|
||||
else:
|
||||
quote_char = u'"'
|
||||
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
|
||||
|
||||
doctype += u">"
|
||||
yield self.encodeStrict(doctype)
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
if type == "SpaceCharacters" or in_cdata:
|
||||
if in_cdata and token["data"].find("</") >= 0:
|
||||
self.serializeError(_("Unexpected </ in CDATA"))
|
||||
yield self.encode(token["data"])
|
||||
else:
|
||||
yield self.encode(escape(token["data"]))
|
||||
|
||||
elif type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
yield self.encodeStrict(u"<%s" % name)
|
||||
if name in rcdataElements and not self.escape_rcdata:
|
||||
in_cdata = True
|
||||
elif in_cdata:
|
||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||
attributes = []
|
||||
for (attr_namespace,attr_name),attr_value in sorted(token["data"].items()):
|
||||
#TODO: Add namespace support here
|
||||
k = attr_name
|
||||
v = attr_value
|
||||
yield self.encodeStrict(u' ')
|
||||
|
||||
yield self.encodeStrict(k)
|
||||
if not self.minimize_boolean_attributes or \
|
||||
(k not in booleanAttributes.get(name, tuple()) \
|
||||
and k not in booleanAttributes.get("", tuple())):
|
||||
yield self.encodeStrict(u"=")
|
||||
if self.quote_attr_values or not v:
|
||||
quote_attr = True
|
||||
else:
|
||||
quote_attr = reduce(lambda x,y: x or (y in v),
|
||||
spaceCharacters + u">\"'=", False)
|
||||
v = v.replace(u"&", u"&")
|
||||
if self.escape_lt_in_attrs: v = v.replace(u"<", u"<")
|
||||
if quote_attr:
|
||||
quote_char = self.quote_char
|
||||
if self.use_best_quote_char:
|
||||
if u"'" in v and u'"' not in v:
|
||||
quote_char = u'"'
|
||||
elif u'"' in v and u"'" not in v:
|
||||
quote_char = u"'"
|
||||
if quote_char == u"'":
|
||||
v = v.replace(u"'", u"'")
|
||||
else:
|
||||
v = v.replace(u'"', u""")
|
||||
yield self.encodeStrict(quote_char)
|
||||
yield self.encode(v)
|
||||
yield self.encodeStrict(quote_char)
|
||||
else:
|
||||
yield self.encode(v)
|
||||
if name in voidElements and self.use_trailing_solidus:
|
||||
if self.space_before_trailing_solidus:
|
||||
yield self.encodeStrict(u" /")
|
||||
else:
|
||||
yield self.encodeStrict(u"/")
|
||||
yield self.encode(u">")
|
||||
|
||||
elif type == "EndTag":
|
||||
name = token["name"]
|
||||
if name in rcdataElements:
|
||||
in_cdata = False
|
||||
elif in_cdata:
|
||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||
yield self.encodeStrict(u"</%s>" % name)
|
||||
|
||||
elif type == "Comment":
|
||||
data = token["data"]
|
||||
if data.find("--") >= 0:
|
||||
self.serializeError(_("Comment contains --"))
|
||||
yield self.encodeStrict(u"<!--%s-->" % token["data"])
|
||||
|
||||
elif type == "Entity":
|
||||
name = token["name"]
|
||||
key = name + ";"
|
||||
if not key in entities:
|
||||
self.serializeError(_("Entity %s not recognized" % name))
|
||||
if self.resolve_entities and key not in xmlEntities:
|
||||
data = entities[key]
|
||||
else:
|
||||
data = u"&%s;" % name
|
||||
yield self.encodeStrict(data)
|
||||
|
||||
else:
|
||||
self.serializeError(token["data"])
|
||||
|
||||
def render(self, treewalker, encoding=None):
|
||||
if encoding:
|
||||
return "".join(list(self.serialize(treewalker, encoding)))
|
||||
else:
|
||||
return u"".join(list(self.serialize(treewalker)))
|
||||
|
||||
    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError


class SerializeError(Exception):
    """Error in serialized tree"""
    pass
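
# Illustrative usage sketch (assumed tree walker, not part of the original
# source): the serializer consumes a token stream from a tree walker, roughly:
#
#   from html5lib import treewalkers
#   walker = treewalkers.getTreeWalker("simpletree")
#   s = HTMLSerializer(omit_optional_tags=False)
#   s.render(walker(parsed_document))   # parsed_document is a hypothetical parse tree
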
9  libs/html5lib/serializer/xhtmlserializer.py  Normal file
@@ -0,0 +1,9 @@
from htmlserializer import HTMLSerializer

class XHTMLSerializer(HTMLSerializer):
    quote_attr_values = True
    minimize_boolean_attributes = False
    use_trailing_solidus = True
    escape_lt_in_attrs = True
    omit_optional_tags = False
    escape_rcdata = True
1744  libs/html5lib/tokenizer.py  Normal file
(File diff suppressed because it is too large)
96  libs/html5lib/treebuilders/__init__.py  Executable file
@@ -0,0 +1,96 @@
|
||||
"""A collection of modules for building different kinds of tree from
|
||||
HTML documents.
|
||||
|
||||
To create a treebuilder for a new type of tree, you need to do
|
||||
implement several things:
|
||||
|
||||
1) A set of classes for various types of elements: Document, Doctype,
|
||||
Comment, Element. These must implement the interface of
|
||||
_base.treebuilders.Node (although comment nodes have a different
|
||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||
Textual content may also be implemented as another node type, or not, as
|
||||
your tree implementation requires.
|
||||
|
||||
2) A treebuilder object (called TreeBuilder by convention) that
|
||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
It also has one required method:
|
||||
getDocument - Returns the root node of the complete document tree
|
||||
|
||||
3) If you wish to run the unit tests, you must also create a
|
||||
testSerializer method on your treebuilder which accepts a node and
|
||||
returns a string containing Node and its children serialized according
|
||||
to the format used in the unittests
|
||||
|
||||
The supplied simpletree module provides a python-only implementation
|
||||
of a full treebuilder and is a useful reference for the semantics of
|
||||
the various methods.
|
||||
"""
|
||||
|
||||
treeBuilderCache = {}
|
||||
|
||||
import sys
|
||||
|
||||
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||
"""Get a TreeBuilder class for various types of tree with built-in support
|
||||
|
||||
treeType - the name of the tree type required (case-insensitive). Supported
|
||||
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
||||
|
||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||
more pythonic idioms.
|
||||
"dom" - A generic builder for DOM implementations, defaulting to
|
||||
a xml.dom.minidom based implementation for the sake of
|
||||
backwards compatibility (as releases up until 0.10 had a
|
||||
builder called "dom" that was a minidom implemenation).
|
||||
"etree" - A generic builder for tree implementations exposing an
|
||||
elementtree-like interface (known to work with
|
||||
ElementTree, cElementTree and lxml.etree).
|
||||
"beautifulsoup" - Beautiful soup (if installed)
|
||||
|
||||
implementation - (Currently applies to the "etree" and "dom" tree types). A
|
||||
module implementing the tree type e.g.
|
||||
xml.etree.ElementTree or lxml.etree."""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeBuilderCache:
|
||||
if treeType == "dom":
|
||||
import dom
|
||||
# XXX: Keep backwards compatibility by using minidom if no implementation is given
|
||||
if implementation == None:
|
||||
from xml.dom import minidom
|
||||
implementation = minidom
|
||||
# XXX: NEVER cache here, caching is done in the dom submodule
|
||||
return dom.getDomModule(implementation, **kwargs).TreeBuilder
|
||||
elif treeType == "simpletree":
|
||||
import simpletree
|
||||
treeBuilderCache[treeType] = simpletree.TreeBuilder
|
||||
elif treeType == "beautifulsoup":
|
||||
import soup
|
||||
treeBuilderCache[treeType] = soup.TreeBuilder
|
||||
elif treeType == "lxml":
|
||||
import etree_lxml
|
||||
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
||||
elif treeType == "etree":
|
||||
# Come up with a sane default
|
||||
if implementation == None:
|
||||
try:
|
||||
import xml.etree.cElementTree as ET
|
||||
except ImportError:
|
||||
try:
|
||||
import xml.etree.ElementTree as ET
|
||||
except ImportError:
|
||||
try:
|
||||
import cElementTree as ET
|
||||
except ImportError:
|
||||
import elementtree.ElementTree as ET
|
||||
implementation = ET
|
||||
import etree
|
||||
# NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||
else:
|
||||
raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
|
||||
return treeBuilderCache.get(treeType)
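
# Illustrative usage sketch (not part of the original source):
#
#   import html5lib
#   from html5lib import treebuilders
#   tb = treebuilders.getTreeBuilder("etree")   # picks a bundled ElementTree impl
#   doc = html5lib.HTMLParser(tree=tb).parse("<p>hello")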
|
||||
377  libs/html5lib/treebuilders/_base.py  Executable file
@@ -0,0 +1,377 @@
|
||||
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
# The scope markers are inserted when entering object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
# from "leaking" into tables, object elements, and marquees.
|
||||
Marker = None
|
||||
|
||||
class Node(object):
|
||||
def __init__(self, name):
|
||||
"""Node representing an item in the tree.
|
||||
name - The tag name associated with the node
|
||||
parent - The parent of the current node (or None for the document node)
|
||||
value - The value of the current node (applies to text nodes and
|
||||
comments
|
||||
attributes - a dict holding name, value pairs for attributes of the node
|
||||
childNodes - a list of child nodes of the current node. This must
|
||||
include all elements but not necessarily other node types
|
||||
_flags - A list of miscellaneous flags that can be set on the node
|
||||
"""
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self.value = None
|
||||
self.attributes = {}
|
||||
self.childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def __unicode__(self):
|
||||
attributesStr = " ".join(["%s=\"%s\""%(name, value)
|
||||
for name, value in
|
||||
self.attributes.iteritems()])
|
||||
if attributesStr:
|
||||
return "<%s %s>"%(self.name,attributesStr)
|
||||
else:
|
||||
return "<%s>"%(self.name)
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s>" % (self.name)
|
||||
|
||||
def appendChild(self, node):
|
||||
"""Insert node as a child of the current node
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
"""Insert data as text in the current node, positioned before the
|
||||
start of node insertBefore or to the end of the node's text.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
"""Insert node as a child of the current node, before refNode in the
|
||||
list of child nodes. Raises ValueError if refNode is not a child of
|
||||
the current node"""
|
||||
raise NotImplementedError
|
||||
|
||||
def removeChild(self, node):
|
||||
"""Remove node from the children of the current node
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
"""Move all the children of the current node to newParent.
|
||||
This is needed so that trees that don't store text as nodes move the
|
||||
text in the correct way
|
||||
"""
|
||||
#XXX - should this method be made more general?
|
||||
for child in self.childNodes:
|
||||
newParent.appendChild(child)
|
||||
self.childNodes = []
|
||||
|
||||
def cloneNode(self):
|
||||
"""Return a shallow copy of the current node i.e. a node with the same
|
||||
name and attributes but with no parent or child nodes
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text, false otherwise
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
class ActiveFormattingElements(list):
|
||||
def append(self, node):
|
||||
equalCount = 0
|
||||
if node != Marker:
|
||||
for element in self[::-1]:
|
||||
if element == Marker:
|
||||
break
|
||||
if self.nodesEqual(element, node):
|
||||
equalCount += 1
|
||||
if equalCount == 3:
|
||||
self.remove(element)
|
||||
break
|
||||
list.append(self, node)
|
||||
|
||||
def nodesEqual(self, node1, node2):
|
||||
if not node1.nameTuple == node2.nameTuple:
|
||||
return False
|
||||
|
||||
if not node1.attributes == node2.attributes:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Base treebuilder implementation
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
"""
|
||||
|
||||
#Document class
|
||||
documentClass = None
|
||||
|
||||
#The class to use for creating a node
|
||||
elementClass = None
|
||||
|
||||
#The class to use for creating comments
|
||||
commentClass = None
|
||||
|
||||
#The class to use for creating doctypes
|
||||
doctypeClass = None
|
||||
|
||||
#Fragment class
|
||||
fragmentClass = None
|
||||
|
||||
def __init__(self, namespaceHTMLElements):
|
||||
if namespaceHTMLElements:
|
||||
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
||||
else:
|
||||
self.defaultNamespace = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.openElements = []
|
||||
self.activeFormattingElements = ActiveFormattingElements()
|
||||
|
||||
#XXX - rename these to headElement, formElement
|
||||
self.headPointer = None
|
||||
self.formPointer = None
|
||||
|
||||
self.insertFromTable = False
|
||||
|
||||
self.document = self.documentClass()
|
||||
|
||||
def elementInScope(self, target, variant=None):
|
||||
|
||||
#If we pass a node in we match that. if we pass a string
|
||||
#match any node with that name
|
||||
exactNode = hasattr(target, "nameTuple")
|
||||
|
||||
listElementsMap = {
|
||||
None:(scopingElements, False),
|
||||
"button":(scopingElements | set([(namespaces["html"], "button")]), False),
|
||||
"list":(scopingElements | set([(namespaces["html"], "ol"),
|
||||
(namespaces["html"], "ul")]), False),
|
||||
"table":(set([(namespaces["html"], "html"),
|
||||
(namespaces["html"], "table")]), False),
|
||||
"select":(set([(namespaces["html"], "optgroup"),
|
||||
(namespaces["html"], "option")]), True)
|
||||
}
|
||||
listElements, invert = listElementsMap[variant]
|
||||
|
||||
for node in reversed(self.openElements):
|
||||
if (node.name == target and not exactNode or
|
||||
node == target and exactNode):
|
||||
return True
|
||||
elif (invert ^ (node.nameTuple in listElements)):
|
||||
return False
|
||||
|
||||
assert False # We should never reach this point
|
||||
|
||||
def reconstructActiveFormattingElements(self):
|
||||
# Within this algorithm the order of steps described in the
|
||||
# specification is not quite the same as the order of steps in the
|
||||
# code. It should still do the same though.
|
||||
|
||||
# Step 1: stop the algorithm when there's nothing to do.
|
||||
if not self.activeFormattingElements:
|
||||
return
|
||||
|
||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||
i = len(self.activeFormattingElements) - 1
|
||||
entry = self.activeFormattingElements[i]
|
||||
if entry == Marker or entry in self.openElements:
|
||||
return
|
||||
|
||||
# Step 6
|
||||
while entry != Marker and entry not in self.openElements:
|
||||
if i == 0:
|
||||
#This will be reset to 0 below
|
||||
i = -1
|
||||
break
|
||||
i -= 1
|
||||
# Step 5: let entry be one earlier in the list.
|
||||
entry = self.activeFormattingElements[i]
|
||||
|
||||
while True:
|
||||
# Step 7
|
||||
i += 1
|
||||
|
||||
# Step 8
|
||||
entry = self.activeFormattingElements[i]
|
||||
clone = entry.cloneNode() #Mainly to get a new copy of the attributes
|
||||
|
||||
# Step 9
|
||||
element = self.insertElement({"type":"StartTag",
|
||||
"name":clone.name,
|
||||
"namespace":clone.namespace,
|
||||
"data":clone.attributes})
|
||||
|
||||
# Step 10
|
||||
self.activeFormattingElements[i] = element
|
||||
|
||||
# Step 11
|
||||
if element == self.activeFormattingElements[-1]:
|
||||
break
|
||||
|
||||
def clearActiveFormattingElements(self):
|
||||
entry = self.activeFormattingElements.pop()
|
||||
while self.activeFormattingElements and entry != Marker:
|
||||
entry = self.activeFormattingElements.pop()
|
||||
|
||||
def elementInActiveFormattingElements(self, name):
|
||||
"""Check if an element exists between the end of the active
|
||||
formatting elements and the last marker. If it does, return it, else
|
||||
return false"""
|
||||
|
||||
for item in self.activeFormattingElements[::-1]:
|
||||
# Check for Marker first because if it's a Marker it doesn't have a
|
||||
# name attribute.
|
||||
if item == Marker:
|
||||
break
|
||||
elif item.name == name:
|
||||
return item
|
||||
return False
|
||||
|
||||
def insertRoot(self, token):
|
||||
element = self.createElement(token)
|
||||
self.openElements.append(element)
|
||||
self.document.appendChild(element)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
doctype = self.doctypeClass(name, publicId, systemId)
|
||||
self.document.appendChild(doctype)
|
||||
|
||||
def insertComment(self, token, parent=None):
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
parent.appendChild(self.commentClass(token["data"]))
|
||||
|
||||
def createElement(self, token):
|
||||
"""Create an element but don't insert it anywhere"""
|
||||
name = token["name"]
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
element = self.elementClass(name, namespace)
|
||||
element.attributes = token["data"]
|
||||
return element
|
||||
|
||||
def _getInsertFromTable(self):
|
||||
return self._insertFromTable
|
||||
|
||||
def _setInsertFromTable(self, value):
|
||||
"""Switch the function used to insert an element from the
|
||||
normal one to the misnested table one and back again"""
|
||||
self._insertFromTable = value
|
||||
if value:
|
||||
self.insertElement = self.insertElementTable
|
||||
else:
|
||||
self.insertElement = self.insertElementNormal
|
||||
|
||||
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||
|
||||
def insertElementNormal(self, token):
|
||||
name = token["name"]
|
||||
assert type(name) == unicode, "Element %s not unicode"%name
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
element = self.elementClass(name, namespace)
|
||||
element.attributes = token["data"]
|
||||
self.openElements[-1].appendChild(element)
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def insertElementTable(self, token):
|
||||
"""Create an element and insert it into the tree"""
|
||||
element = self.createElement(token)
|
||||
if self.openElements[-1].name not in tableInsertModeElements:
|
||||
return self.insertElementNormal(token)
|
||||
else:
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||
if insertBefore is None:
|
||||
parent.appendChild(element)
|
||||
else:
|
||||
parent.insertBefore(element, insertBefore)
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
"""Insert text data."""
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
|
||||
if (not self.insertFromTable or (self.insertFromTable and
|
||||
self.openElements[-1].name
|
||||
not in tableInsertModeElements)):
|
||||
parent.insertText(data)
|
||||
else:
|
||||
# We should be in the InTable mode. This means we want to do
|
||||
# special magic element rearranging
|
||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||
parent.insertText(data, insertBefore)
|
||||
|
||||
def getTableMisnestedNodePosition(self):
|
||||
"""Get the foster parent element, and sibling to insert before
|
||||
(or None) when inserting a misnested table node"""
|
||||
# The foster parent element is the one which comes before the most
|
||||
# recently opened table element
|
||||
# XXX - this is really inelegant
|
||||
lastTable=None
|
||||
fosterParent = None
|
||||
insertBefore = None
|
||||
for elm in self.openElements[::-1]:
|
||||
if elm.name == "table":
|
||||
lastTable = elm
|
||||
break
|
||||
if lastTable:
|
||||
# XXX - we should really check that this parent is actually a
|
||||
# node here
|
||||
if lastTable.parent:
|
||||
fosterParent = lastTable.parent
|
||||
insertBefore = lastTable
|
||||
else:
|
||||
fosterParent = self.openElements[
|
||||
self.openElements.index(lastTable) - 1]
|
||||
else:
|
||||
fosterParent = self.openElements[0]
|
||||
return fosterParent, insertBefore
|
||||
|
||||
def generateImpliedEndTags(self, exclude=None):
|
||||
name = self.openElements[-1].name
|
||||
# XXX td, th and tr are not actually needed
|
||||
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
|
||||
and name != exclude):
|
||||
self.openElements.pop()
|
||||
# XXX This is not entirely what the specification says. We should
|
||||
# investigate it more closely.
|
||||
self.generateImpliedEndTags(exclude)
|
||||
|
||||
def getDocument(self):
|
||||
"Return the final tree"
|
||||
return self.document
|
||||
|
||||
def getFragment(self):
|
||||
"Return the final fragment"
|
||||
#assert self.innerHTML
|
||||
fragment = self.fragmentClass()
|
||||
self.openElements[0].reparentChildren(fragment)
|
||||
return fragment
|
||||
|
||||
def testSerializer(self, node):
|
||||
"""Serialize the subtree of node in the format required by unit tests
|
||||
node - the node from which to start serializing"""
|
||||
raise NotImplementedError
|
||||
291
libs/html5lib/treebuilders/dom.py
Normal file
@@ -0,0 +1,291 @@
|
||||
|
||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||
try:
|
||||
from types import ModuleType
|
||||
except:
|
||||
from new import module as ModuleType
|
||||
import re
|
||||
import weakref
|
||||
|
||||
import _base
|
||||
from html5lib import constants, ihatexml
|
||||
from html5lib.constants import namespaces
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
def getDomModule(DomImplementation):
|
||||
name = "_" + DomImplementation.__name__+"builder"
|
||||
if name in moduleCache:
|
||||
return moduleCache[name]
|
||||
else:
|
||||
mod = ModuleType(name)
|
||||
objs = getDomBuilder(DomImplementation)
|
||||
mod.__dict__.update(objs)
|
||||
moduleCache[name] = mod
|
||||
return mod
|
||||
|
||||
def getDomBuilder(DomImplementation):
|
||||
Dom = DomImplementation
|
||||
class AttrList(object):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
def __iter__(self):
|
||||
return self.element.attributes.items().__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
self.element.setAttribute(name, value)
|
||||
def __len__(self):
|
||||
return len(self.element.attributes.items())
|
||||
def items(self):
|
||||
return [(item[0], item[1]) for item in
|
||||
self.element.attributes.items()]
|
||||
def keys(self):
|
||||
return self.element.attributes.keys()
|
||||
def __getitem__(self, name):
|
||||
return self.element.getAttribute(name)
|
||||
|
||||
def __contains__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
return self.element.hasAttribute(name)
|
||||
|
||||
class NodeBuilder(_base.Node):
|
||||
def __init__(self, element):
|
||||
_base.Node.__init__(self, element.nodeName)
|
||||
self.element = element
|
||||
|
||||
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
|
||||
and self.element.namespaceURI or None)
|
||||
|
||||
def appendChild(self, node):
|
||||
node.parent = self
|
||||
self.element.appendChild(node.element)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = self.element.ownerDocument.createTextNode(data)
|
||||
if insertBefore:
|
||||
self.element.insertBefore(text, insertBefore.element)
|
||||
else:
|
||||
self.element.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
self.element.insertBefore(node.element, refNode.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
if node.element.parentNode == self.element:
|
||||
self.element.removeChild(node.element)
|
||||
node.parent = None
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
while self.element.hasChildNodes():
|
||||
child = self.element.firstChild
|
||||
self.element.removeChild(child)
|
||||
newParent.element.appendChild(child)
|
||||
self.childNodes = []
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
if attributes:
|
||||
for name, value in attributes.items():
|
||||
if isinstance(name, tuple):
|
||||
if name[0] is not None:
|
||||
qualifiedName = (name[0] + ":" + name[1])
|
||||
else:
|
||||
qualifiedName = name[1]
|
||||
self.element.setAttributeNS(name[2], qualifiedName,
|
||||
value)
|
||||
else:
|
||||
self.element.setAttribute(
|
||||
name, value)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def cloneNode(self):
|
||||
return NodeBuilder(self.element.cloneNode(False))
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.hasChildNodes()
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
|
||||
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
|
||||
return weakref.proxy(self)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
domimpl = Dom.getDOMImplementation()
|
||||
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
||||
self.document.appendChild(NodeBuilder(doctype))
|
||||
if Dom == minidom:
|
||||
doctype.ownerDocument = self.dom
|
||||
|
||||
def elementClass(self, name, namespace=None):
|
||||
if namespace is None and self.defaultNamespace is None:
|
||||
node = self.dom.createElement(name)
|
||||
else:
|
||||
node = self.dom.createElementNS(namespace, name)
|
||||
|
||||
return NodeBuilder(node)
|
||||
|
||||
def commentClass(self, data):
|
||||
return NodeBuilder(self.dom.createComment(data))
|
||||
|
||||
def fragmentClass(self):
|
||||
return NodeBuilder(self.dom.createDocumentFragment())
|
||||
|
||||
def appendChild(self, node):
|
||||
self.dom.appendChild(node.element)
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.dom
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
data=data
|
||||
if parent <> self:
|
||||
_base.TreeBuilder.insertText(self, data, parent)
|
||||
else:
|
||||
# HACK: allow text nodes as children of the document node
|
||||
if hasattr(self.dom, '_child_node_types'):
|
||||
if not Node.TEXT_NODE in self.dom._child_node_types:
|
||||
self.dom._child_node_types=list(self.dom._child_node_types)
|
||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||
self.dom.appendChild(self.dom.createTextNode(data))
|
||||
|
||||
name = None
|
||||
|
||||
def testSerializer(element):
|
||||
element.normalize()
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
if element.name:
|
||||
if element.publicId or element.systemId:
|
||||
publicId = element.publicId or ""
|
||||
systemId = element.systemId or ""
|
||||
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
|
||||
' '*indent, element.name, publicId, systemId))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
|
||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
rv.append("#document-fragment")
|
||||
elif element.nodeType == Node.COMMENT_NODE:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||
elif element.nodeType == Node.TEXT_NODE:
|
||||
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||
else:
|
||||
if (hasattr(element, "namespaceURI") and
|
||||
element.namespaceURI != None):
|
||||
name = "%s %s"%(constants.prefixes[element.namespaceURI],
|
||||
element.nodeName)
|
||||
else:
|
||||
name = element.nodeName
|
||||
rv.append("|%s<%s>"%(' '*indent, name))
|
||||
if element.hasAttributes():
|
||||
attributes = []
|
||||
for i in range(len(element.attributes)):
|
||||
attr = element.attributes.item(i)
|
||||
name = attr.nodeName
|
||||
value = attr.value
|
||||
ns = attr.namespaceURI
|
||||
if ns:
|
||||
name = "%s %s"%(constants.prefixes[ns], attr.localName)
|
||||
else:
|
||||
name = attr.nodeName
|
||||
attributes.append((name, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
indent += 2
|
||||
for child in element.childNodes:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
||||
if node.nodeType == Node.ELEMENT_NODE:
|
||||
if not nsmap:
|
||||
handler.startElement(node.nodeName, node.attributes)
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
handler.endElement(node.nodeName)
|
||||
else:
|
||||
attributes = dict(node.attributes.itemsNS())
|
||||
|
||||
# gather namespace declarations
|
||||
prefixes = []
|
||||
for attrname in node.attributes.keys():
|
||||
attr = node.getAttributeNode(attrname)
|
||||
if (attr.namespaceURI == XMLNS_NAMESPACE or
|
||||
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
|
||||
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
|
||||
handler.startPrefixMapping(prefix, attr.nodeValue)
|
||||
prefixes.append(prefix)
|
||||
nsmap = nsmap.copy()
|
||||
nsmap[prefix] = attr.nodeValue
|
||||
del attributes[(attr.namespaceURI, attr.nodeName)]
|
||||
|
||||
# apply namespace declarations
|
||||
for attrname in node.attributes.keys():
|
||||
attr = node.getAttributeNode(attrname)
|
||||
if attr.namespaceURI == None and ':' in attr.nodeName:
|
||||
prefix = attr.nodeName.split(':')[0]
|
||||
if nsmap.has_key(prefix):
|
||||
del attributes[(attr.namespaceURI, attr.nodeName)]
|
||||
attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
|
||||
|
||||
# SAX events
|
||||
ns = node.namespaceURI or nsmap.get(None,None)
|
||||
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||
for prefix in prefixes: handler.endPrefixMapping(prefix)
|
||||
|
||||
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
|
||||
handler.characters(node.nodeValue)
|
||||
|
||||
elif node.nodeType == Node.DOCUMENT_NODE:
|
||||
handler.startDocument()
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
handler.endDocument()
|
||||
|
||||
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
|
||||
else:
|
||||
# ATTRIBUTE_NODE
|
||||
# ENTITY_NODE
|
||||
# PROCESSING_INSTRUCTION_NODE
|
||||
# COMMENT_NODE
|
||||
# DOCUMENT_TYPE_NODE
|
||||
# NOTATION_NODE
|
||||
pass
|
||||
|
||||
return locals()
|
||||
|
||||
# Keep backwards compatibility with things that directly load
|
||||
# classes/functions from this module
|
||||
for key, value in getDomModule(minidom).__dict__.items():
|
||||
globals()[key] = value
|
||||
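For context, a minimal usage sketch for the minidom-backed builder above (illustrative only, not part of the committed file; it assumes the bundled package imports as html5lib, that getTreeBuilder registers this backend under "dom", and the sample markup is made up):

from html5lib import HTMLParser, treebuilders

# "dom" resolves to the minidom-backed TreeBuilder generated in this module
parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
document = parser.parse("<p>Hello <b>world")
print document.toxml()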
344
libs/html5lib/treebuilders/etree.py
Executable file
@@ -0,0 +1,344 @@
|
||||
try:
|
||||
from types import ModuleType
|
||||
except:
|
||||
from new import module as ModuleType
|
||||
import re
|
||||
import types
|
||||
|
||||
import _base
|
||||
from html5lib import ihatexml
|
||||
from html5lib import constants
|
||||
from html5lib.constants import namespaces
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
def getETreeModule(ElementTreeImplementation, fullTree=False):
|
||||
name = "_" + ElementTreeImplementation.__name__+"builder"
|
||||
if name in moduleCache:
|
||||
return moduleCache[name]
|
||||
else:
|
||||
mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder")
|
||||
objs = getETreeBuilder(ElementTreeImplementation, fullTree)
|
||||
mod.__dict__.update(objs)
|
||||
moduleCache[name] = mod
|
||||
return mod
|
||||
|
||||
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
ElementTree = ElementTreeImplementation
|
||||
class Element(_base.Node):
|
||||
def __init__(self, name, namespace=None):
|
||||
self._name = name
|
||||
self._namespace = namespace
|
||||
self._element = ElementTree.Element(self._getETreeTag(name,
|
||||
namespace))
|
||||
if namespace is None:
|
||||
self.nameTuple = namespaces["html"], self._name
|
||||
else:
|
||||
self.nameTuple = self._namespace, self._name
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _getETreeTag(self, name, namespace):
|
||||
if namespace is None:
|
||||
etree_tag = name
|
||||
else:
|
||||
etree_tag = "{%s}%s"%(namespace, name)
|
||||
return etree_tag
|
||||
|
||||
def _setName(self, name):
|
||||
self._name = name
|
||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||
|
||||
def _getName(self):
|
||||
return self._name
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _setNamespace(self, namespace):
|
||||
self._namespace = namespace
|
||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||
|
||||
def _getNamespace(self):
|
||||
return self._namespace
|
||||
|
||||
namespace = property(_getNamespace, _setNamespace)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._element.attrib
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
#Delete existing attributes first
|
||||
#XXX - there may be a better way to do this...
|
||||
for key in self._element.attrib.keys():
|
||||
del self._element.attrib[key]
|
||||
for key, value in attributes.iteritems():
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s"%(key[2], key[1])
|
||||
else:
|
||||
name = key
|
||||
self._element.set(name, value)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
def _setChildNodes(self, value):
|
||||
del self._element[:]
|
||||
self._childNodes = []
|
||||
for element in value:
|
||||
self.insertChild(element)
|
||||
|
||||
childNodes = property(_getChildNodes, _setChildNodes)
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self._element.text or len(self._element))
|
||||
|
||||
def appendChild(self, node):
|
||||
self._childNodes.append(node)
|
||||
self._element.append(node._element)
|
||||
node.parent = self
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = list(self._element).index(refNode._element)
|
||||
self._element.insert(index, node._element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self._element.remove(node._element)
|
||||
node.parent=None
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
if not(len(self._element)):
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
elif insertBefore is None:
|
||||
#Insert the text as the tail of the last child element
|
||||
if not self._element[-1].tail:
|
||||
self._element[-1].tail = ""
|
||||
self._element[-1].tail += data
|
||||
else:
|
||||
#Insert the text before the specified node
|
||||
children = list(self._element)
|
||||
index = children.index(insertBefore._element)
|
||||
if index > 0:
|
||||
if not self._element[index-1].tail:
|
||||
self._element[index-1].tail = ""
|
||||
self._element[index-1].tail += data
|
||||
else:
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
|
||||
def cloneNode(self):
|
||||
element = type(self)(self.name, self.namespace)
|
||||
for name, value in self.attributes.iteritems():
|
||||
element.attributes[name] = value
|
||||
return element
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
if newParent.childNodes:
|
||||
newParent.childNodes[-1]._element.tail += self._element.text
|
||||
else:
|
||||
if not newParent._element.text:
|
||||
newParent._element.text = ""
|
||||
if self._element.text is not None:
|
||||
newParent._element.text += self._element.text
|
||||
self._element.text = ""
|
||||
_base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
|
||||
def __init__(self, data):
|
||||
#Use the superclass constructor to set all properties on the
|
||||
#wrapper element
|
||||
self._element = ElementTree.Comment(data)
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
def _setData(self, value):
|
||||
self._element.text = value
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
|
||||
def __init__(self, name, publicId, systemId):
|
||||
Element.__init__(self, "<!DOCTYPE>")
|
||||
self._element.text = name
|
||||
self.publicId = publicId
|
||||
self.systemId = systemId
|
||||
|
||||
def _getPublicId(self):
|
||||
return self._element.get(u"publicId", "")
|
||||
|
||||
def _setPublicId(self, value):
|
||||
if value is not None:
|
||||
self._element.set(u"publicId", value)
|
||||
|
||||
publicId = property(_getPublicId, _setPublicId)
|
||||
|
||||
def _getSystemId(self):
|
||||
return self._element.get(u"systemId", "")
|
||||
|
||||
def _setSystemId(self, value):
|
||||
if value is not None:
|
||||
self._element.set(u"systemId", value)
|
||||
|
||||
systemId = property(_getSystemId, _setSystemId)
|
||||
|
||||
class Document(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, "<DOCUMENT_ROOT>")
|
||||
|
||||
class DocumentFragment(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, "<DOCUMENT_FRAGMENT>")
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element, indent=0):
|
||||
if not(hasattr(element, "tag")):
|
||||
element = element.getroot()
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
if element.get("publicId") or element.get("systemId"):
|
||||
publicId = element.get("publicId") or ""
|
||||
systemId = element.get("systemId") or ""
|
||||
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
|
||||
element.text, publicId, systemId))
|
||||
else:
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag == "<DOCUMENT_ROOT>":
|
||||
rv.append("#document")
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
elif element.tag == ElementTree.Comment:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
assert type(element.tag) in types.StringTypes, "Expected unicode, got %s"%type(element.tag)
|
||||
nsmatch = tag_regexp.match(element.tag)
|
||||
|
||||
if nsmatch is None:
|
||||
name = element.tag
|
||||
else:
|
||||
ns, name = nsmatch.groups()
|
||||
prefix = constants.prefixes[ns]
|
||||
name = "%s %s"%(prefix, name)
|
||||
rv.append("|%s<%s>"%(' '*indent, name))
|
||||
|
||||
if hasattr(element, "attrib"):
|
||||
attributes = []
|
||||
for name, value in element.attrib.iteritems():
|
||||
nsmatch = tag_regexp.match(name)
|
||||
if nsmatch is not None:
|
||||
ns, name = nsmatch.groups()
|
||||
prefix = constants.prefixes[ns]
|
||||
attr_string = "%s %s"%(prefix, name)
|
||||
else:
|
||||
attr_string = name
|
||||
attributes.append((attr_string, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
indent += 2
|
||||
for child in element:
|
||||
serializeElement(child, indent)
|
||||
if element.tail:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
filter = ihatexml.InfosetFilter()
|
||||
def serializeElement(element):
|
||||
if type(element) == type(ElementTree.ElementTree):
|
||||
element = element.getroot()
|
||||
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
if element.get("publicId") or element.get("systemId"):
|
||||
publicId = element.get("publicId") or ""
|
||||
systemId = element.get("systemId") or ""
|
||||
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
|
||||
element.text, publicId, systemId))
|
||||
else:
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag == "<DOCUMENT_ROOT>":
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
|
||||
for child in element:
|
||||
serializeElement(child)
|
||||
|
||||
elif type(element.tag) == type(ElementTree.Comment):
|
||||
rv.append("<!--%s-->"%(element.text,))
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\""%(
|
||||
filter.fromXmlName(name), value)
|
||||
for name, value in element.attrib.iteritems()])
|
||||
rv.append("<%s %s>"%(element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element:
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>"%(element.tag,))
|
||||
|
||||
if element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("%s\""%(' '*2, finalText))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = Comment
|
||||
fragmentClass = DocumentFragment
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
if fullTree:
|
||||
return self.document._element
|
||||
else:
|
||||
if self.defaultNamespace is not None:
|
||||
return self.document._element.find(
|
||||
"{%s}html"%self.defaultNamespace)
|
||||
else:
|
||||
return self.document._element.find("html")
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self)._element
|
||||
|
||||
return locals()
|
||||
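To illustrate, a minimal sketch of binding the builder above to a concrete ElementTree implementation (illustrative only, not part of the committed file; the chosen implementation and sample markup are assumptions):

import xml.etree.ElementTree as ElementTree
from html5lib import HTMLParser
from html5lib.treebuilders import etree as etree_builders

# With the default fullTree=False, getDocument() returns the <html> element
# rather than the internal <DOCUMENT_ROOT> wrapper element.
builder = etree_builders.getETreeModule(ElementTree).TreeBuilder
html = HTMLParser(tree=builder).parse("<title>t</title><p>body")
print ElementTree.tostring(html)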
336
libs/html5lib/treebuilders/etree_lxml.py
Normal file
@@ -0,0 +1,336 @@
|
||||
import warnings
|
||||
import re
|
||||
|
||||
import _base
|
||||
from html5lib.constants import DataLossWarning
|
||||
import html5lib.constants as constants
|
||||
import etree as etree_builders
|
||||
from html5lib import ihatexml
|
||||
|
||||
try:
|
||||
import lxml.etree as etree
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
fullTree = True
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
"""Module for supporting the lxml.etree library. The idea here is to use as much
|
||||
of the native library as possible, without using fragile hacks like custom element
|
||||
names that break between releases. The downside of this is that we cannot represent
|
||||
all possible trees; specifically the following are known to cause problems:
|
||||
|
||||
Text or comments as siblings of the root element
|
||||
Doctypes with no name
|
||||
|
||||
When any of these things occur, we emit a DataLossWarning
|
||||
"""
|
||||
|
||||
class DocumentType(object):
|
||||
def __init__(self, name, publicId, systemId):
|
||||
self.name = name
|
||||
self.publicId = publicId
|
||||
self.systemId = systemId
|
||||
|
||||
class Document(object):
|
||||
def __init__(self):
|
||||
self._elementTree = None
|
||||
self._childNodes = []
|
||||
|
||||
def appendChild(self, element):
|
||||
self._elementTree.getroot().addnext(element._element)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
childNodes = property(_getChildNodes)
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
filter = ihatexml.InfosetFilter()
|
||||
def serializeElement(element, indent=0):
|
||||
if not hasattr(element, "tag"):
|
||||
if hasattr(element, "getroot"):
|
||||
#Full tree case
|
||||
rv.append("#document")
|
||||
if element.docinfo.internalDTD:
|
||||
if not (element.docinfo.public_id or
|
||||
element.docinfo.system_url):
|
||||
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
|
||||
else:
|
||||
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
|
||||
element.docinfo.root_name,
|
||||
element.docinfo.public_id,
|
||||
element.docinfo.system_url)
|
||||
rv.append("|%s%s"%(' '*(indent+2), dtd_str))
|
||||
next_element = element.getroot()
|
||||
while next_element.getprevious() is not None:
|
||||
next_element = next_element.getprevious()
|
||||
while next_element is not None:
|
||||
serializeElement(next_element, indent+2)
|
||||
next_element = next_element.getnext()
|
||||
elif isinstance(element, basestring):
|
||||
#Text in a fragment
|
||||
rv.append("|%s\"%s\""%(' '*indent, element))
|
||||
else:
|
||||
#Fragment case
|
||||
rv.append("#document-fragment")
|
||||
for next_element in element:
|
||||
serializeElement(next_element, indent+2)
|
||||
elif type(element.tag) == type(etree.Comment):
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
nsmatch = etree_builders.tag_regexp.match(element.tag)
|
||||
if nsmatch is not None:
|
||||
ns = nsmatch.group(1)
|
||||
tag = nsmatch.group(2)
|
||||
prefix = constants.prefixes[ns]
|
||||
rv.append("|%s<%s %s>"%(' '*indent, prefix,
|
||||
filter.fromXmlName(tag)))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent,
|
||||
filter.fromXmlName(element.tag)))
|
||||
|
||||
if hasattr(element, "attrib"):
|
||||
attributes = []
|
||||
for name, value in element.attrib.iteritems():
|
||||
nsmatch = tag_regexp.match(name)
|
||||
if nsmatch is not None:
|
||||
ns, name = nsmatch.groups()
|
||||
name = filter.fromXmlName(name)
|
||||
prefix = constants.prefixes[ns]
|
||||
attr_string = "%s %s"%(prefix, name)
|
||||
else:
|
||||
attr_string = filter.fromXmlName(name)
|
||||
attributes.append((attr_string, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
indent += 2
|
||||
for child in element.getchildren():
|
||||
serializeElement(child, indent)
|
||||
if hasattr(element, "tail") and element.tail:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element):
|
||||
if not hasattr(element, "tag"):
|
||||
if element.docinfo.internalDTD:
|
||||
if element.docinfo.doctype:
|
||||
dtd_str = element.docinfo.doctype
|
||||
else:
|
||||
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
|
||||
rv.append(dtd_str)
|
||||
serializeElement(element.getroot())
|
||||
|
||||
elif type(element.tag) == type(etree.Comment):
|
||||
rv.append("<!--%s-->"%(element.text,))
|
||||
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>"%(element.tag,))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||
for name, value in element.attrib.iteritems()])
|
||||
rv.append("<%s %s>"%(element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>"%(element.tag,))
|
||||
|
||||
if hasattr(element, "tail") and element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("%s\""%(' '*2, finalText))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = None
|
||||
commentClass = None
|
||||
fragmentClass = Document
|
||||
|
||||
def __init__(self, namespaceHTMLElements, fullTree = False):
|
||||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
||||
filter = self.filter = ihatexml.InfosetFilter()
|
||||
self.namespaceHTMLElements = namespaceHTMLElements
|
||||
|
||||
class Attributes(dict):
|
||||
def __init__(self, element, value={}):
|
||||
self._element = element
|
||||
dict.__init__(self, value)
|
||||
for key, value in self.iteritems():
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
|
||||
else:
|
||||
name = filter.coerceAttribute(key)
|
||||
self._element._element.attrib[name] = value
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
dict.__setitem__(self, key, value)
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
|
||||
else:
|
||||
name = filter.coerceAttribute(key)
|
||||
self._element._element.attrib[name] = value
|
||||
|
||||
class Element(builder.Element):
|
||||
def __init__(self, name, namespace):
|
||||
name = filter.coerceElement(name)
|
||||
builder.Element.__init__(self, name, namespace=namespace)
|
||||
self._attributes = Attributes(self)
|
||||
|
||||
def _setName(self, name):
|
||||
self._name = filter.coerceElement(name)
|
||||
self._element.tag = self._getETreeTag(
|
||||
self._name, self._namespace)
|
||||
|
||||
def _getName(self):
|
||||
return filter.fromXmlName(self._name)
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._attributes
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
self._attributes = Attributes(self, attributes)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
data = filter.coerceCharacters(data)
|
||||
builder.Element.insertText(self, data, insertBefore)
|
||||
|
||||
def appendChild(self, child):
|
||||
builder.Element.appendChild(self, child)
|
||||
|
||||
|
||||
class Comment(builder.Comment):
|
||||
def __init__(self, data):
|
||||
data = filter.coerceComment(data)
|
||||
builder.Comment.__init__(self, data)
|
||||
|
||||
def _setData(self, data):
|
||||
data = filter.coerceComment(data)
|
||||
self._element.text = data
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
self.elementClass = Element
|
||||
self.commentClass = builder.Comment
|
||||
#self.fragmentClass = builder.DocumentFragment
|
||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||
|
||||
def reset(self):
|
||||
_base.TreeBuilder.reset(self)
|
||||
self.insertComment = self.insertCommentInitial
|
||||
self.initial_comments = []
|
||||
self.doctype = None
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
if fullTree:
|
||||
return self.document._elementTree
|
||||
else:
|
||||
return self.document._elementTree.getroot()
|
||||
|
||||
def getFragment(self):
|
||||
fragment = []
|
||||
element = self.openElements[0]._element
|
||||
if element.text:
|
||||
fragment.append(element.text)
|
||||
fragment.extend(element.getchildren())
|
||||
if element.tail:
|
||||
fragment.append(element.tail)
|
||||
return fragment
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
|
||||
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
|
||||
|
||||
doctype = self.doctypeClass(name, publicId, systemId)
|
||||
self.doctype = doctype
|
||||
|
||||
def insertCommentInitial(self, data, parent=None):
|
||||
self.initial_comments.append(data)
|
||||
|
||||
def insertRoot(self, token):
|
||||
"""Create the document root"""
|
||||
#Because of the way libxml2 works, it doesn't seem to be possible to
|
||||
#alter information like the doctype after the tree has been parsed.
|
||||
#Therefore we need to use the built-in parser to create our initial
|
||||
#tree, after which we can add elements like normal
|
||||
docStr = ""
|
||||
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
|
||||
docStr += "<!DOCTYPE %s"%self.doctype.name
|
||||
if (self.doctype.publicId is not None or
|
||||
self.doctype.systemId is not None):
|
||||
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
|
||||
self.doctype.systemId or "")
|
||||
docStr += ">"
|
||||
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
|
||||
|
||||
try:
|
||||
root = etree.fromstring(docStr)
|
||||
except etree.XMLSyntaxError:
|
||||
print docStr
|
||||
raise
|
||||
|
||||
#Append the initial comments:
|
||||
for comment_token in self.initial_comments:
|
||||
root.addprevious(etree.Comment(comment_token["data"]))
|
||||
|
||||
#Create the root document and add the ElementTree to it
|
||||
self.document = self.documentClass()
|
||||
self.document._elementTree = root.getroottree()
|
||||
|
||||
# Give the root element the right name
|
||||
name = token["name"]
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
if namespace is None:
|
||||
etree_tag = name
|
||||
else:
|
||||
etree_tag = "{%s}%s"%(namespace, name)
|
||||
root.tag = etree_tag
|
||||
|
||||
#Add the root element to the internal child/open data structures
|
||||
root_element = self.elementClass(name, namespace)
|
||||
root_element._element = root
|
||||
self.document._childNodes.append(root_element)
|
||||
self.openElements.append(root_element)
|
||||
|
||||
#Reset to the default insert comment function
|
||||
self.insertComment = super(TreeBuilder, self).insertComment
|
||||
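For reference, a minimal sketch of driving the lxml builder above through the parser (illustrative only, not part of the committed file; it assumes lxml is installed and the sample markup is made up):

import lxml.etree
from html5lib import HTMLParser
from html5lib.treebuilders import etree_lxml

# getDocument() hands back a full lxml ElementTree (fullTree is True in this module)
parser = HTMLParser(tree=etree_lxml.TreeBuilder)
doc = parser.parse("<!DOCTYPE html><title>t</title><p>lxml-backed tree")
print lxml.etree.tostring(doc)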
256
libs/html5lib/treebuilders/simpletree.py
Executable file
@@ -0,0 +1,256 @@
|
||||
import _base
|
||||
from html5lib.constants import voidElements, namespaces, prefixes
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
# Really crappy basic implementation of a DOM-core like thing
|
||||
class Node(_base.Node):
|
||||
type = -1
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self.value = None
|
||||
self.childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def __iter__(self):
|
||||
for node in self.childNodes:
|
||||
yield node
|
||||
for item in node:
|
||||
yield item
|
||||
|
||||
def __unicode__(self):
|
||||
return self.name
|
||||
|
||||
def toxml(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def printTree(self, indent=0):
|
||||
tree = '\n|%s%s' % (' '* indent, unicode(self))
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(indent + 2)
|
||||
return tree
|
||||
|
||||
def appendChild(self, node):
|
||||
assert isinstance(node, Node)
|
||||
if (isinstance(node, TextNode) and self.childNodes and
|
||||
isinstance(self.childNodes[-1], TextNode)):
|
||||
self.childNodes[-1].value += node.value
|
||||
else:
|
||||
self.childNodes.append(node)
|
||||
node.parent = self
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
assert isinstance(data, unicode), "data %s is of type %s expected unicode"%(repr(data), type(data))
|
||||
if insertBefore is None:
|
||||
self.appendChild(TextNode(data))
|
||||
else:
|
||||
self.insertBefore(TextNode(data), insertBefore)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.childNodes.index(refNode)
|
||||
if (isinstance(node, TextNode) and index > 0 and
|
||||
isinstance(self.childNodes[index - 1], TextNode)):
|
||||
self.childNodes[index - 1].value += node.value
|
||||
else:
|
||||
self.childNodes.insert(index, node)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
try:
|
||||
self.childNodes.remove(node)
|
||||
except:
|
||||
# XXX
|
||||
raise
|
||||
node.parent = None
|
||||
|
||||
def cloneNode(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self.childNodes)
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class Document(Node):
|
||||
type = 1
|
||||
def __init__(self):
|
||||
Node.__init__(self, None)
|
||||
|
||||
def __str__(self):
|
||||
return "#document"
|
||||
|
||||
def __unicode__(self):
|
||||
return str(self)
|
||||
|
||||
def appendChild(self, child):
|
||||
Node.appendChild(self, child)
|
||||
|
||||
def toxml(self, encoding="utf=8"):
|
||||
result = ""
|
||||
for child in self.childNodes:
|
||||
result += child.toxml()
|
||||
return result.encode(encoding)
|
||||
|
||||
def hilite(self, encoding="utf-8"):
|
||||
result = "<pre>"
|
||||
for child in self.childNodes:
|
||||
result += child.hilite()
|
||||
return result.encode(encoding) + "</pre>"
|
||||
|
||||
def printTree(self):
|
||||
tree = unicode(self)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(2)
|
||||
return tree
|
||||
|
||||
def cloneNode(self):
|
||||
return Document()
|
||||
|
||||
class DocumentFragment(Document):
|
||||
type = 2
|
||||
def __str__(self):
|
||||
return "#document-fragment"
|
||||
|
||||
def __unicode__(self):
|
||||
return str(self)
|
||||
|
||||
def cloneNode(self):
|
||||
return DocumentFragment()
|
||||
|
||||
class DocumentType(Node):
|
||||
type = 3
|
||||
def __init__(self, name, publicId, systemId):
|
||||
Node.__init__(self, name)
|
||||
self.publicId = publicId
|
||||
self.systemId = systemId
|
||||
|
||||
def __unicode__(self):
|
||||
if self.publicId or self.systemId:
|
||||
publicId = self.publicId or ""
|
||||
systemId = self.systemId or ""
|
||||
return """<!DOCTYPE %s "%s" "%s">"""%(
|
||||
self.name, publicId, systemId)
|
||||
|
||||
else:
|
||||
return u"<!DOCTYPE %s>" % self.name
|
||||
|
||||
|
||||
toxml = __unicode__
|
||||
|
||||
def hilite(self):
|
||||
return '<code class="markup doctype"><!DOCTYPE %s></code>' % self.name
|
||||
|
||||
def cloneNode(self):
|
||||
return DocumentType(self.name, self.publicId, self.systemId)
|
||||
|
||||
class TextNode(Node):
|
||||
type = 4
|
||||
def __init__(self, value):
|
||||
Node.__init__(self, None)
|
||||
self.value = value
|
||||
|
||||
def __unicode__(self):
|
||||
return u"\"%s\"" % self.value
|
||||
|
||||
def toxml(self):
|
||||
return escape(self.value)
|
||||
|
||||
hilite = toxml
|
||||
|
||||
def cloneNode(self):
|
||||
return TextNode(self.value)
|
||||
|
||||
class Element(Node):
|
||||
type = 5
|
||||
def __init__(self, name, namespace=None):
|
||||
Node.__init__(self, name)
|
||||
self.namespace = namespace
|
||||
self.attributes = {}
|
||||
|
||||
def __unicode__(self):
|
||||
if self.namespace == None:
|
||||
return u"<%s>" % self.name
|
||||
else:
|
||||
return u"<%s %s>"%(prefixes[self.namespace], self.name)
|
||||
|
||||
def toxml(self):
|
||||
result = '<' + self.name
|
||||
if self.attributes:
|
||||
for name,value in self.attributes.iteritems():
|
||||
result += u' %s="%s"' % (name, escape(value,{'"':'"'}))
|
||||
if self.childNodes:
|
||||
result += '>'
|
||||
for child in self.childNodes:
|
||||
result += child.toxml()
|
||||
result += u'</%s>' % self.name
|
||||
else:
|
||||
result += u'/>'
|
||||
return result
|
||||
|
||||
def hilite(self):
|
||||
result = '<<code class="markup element-name">%s</code>' % self.name
|
||||
if self.attributes:
|
||||
for name, value in self.attributes.iteritems():
|
||||
result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'"'}))
|
||||
if self.childNodes:
|
||||
result += ">"
|
||||
for child in self.childNodes:
|
||||
result += child.hilite()
|
||||
elif self.name in voidElements:
|
||||
return result + ">"
|
||||
return result + '&lt;/<code class="markup element-name">%s</code>&gt;' % self.name
|
||||
|
||||
def printTree(self, indent):
|
||||
tree = '\n|%s%s' % (' '*indent, unicode(self))
|
||||
indent += 2
|
||||
if self.attributes:
|
||||
for name, value in sorted(self.attributes.iteritems()):
|
||||
if isinstance(name, tuple):
|
||||
name = "%s %s"%(name[0], name[1])
|
||||
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(indent)
|
||||
return tree
|
||||
|
||||
def cloneNode(self):
|
||||
newNode = Element(self.name)
|
||||
if hasattr(self, 'namespace'):
|
||||
newNode.namespace = self.namespace
|
||||
for attr, value in self.attributes.iteritems():
|
||||
newNode.attributes[attr] = value
|
||||
return newNode
|
||||
|
||||
class CommentNode(Node):
|
||||
type = 6
|
||||
def __init__(self, data):
|
||||
Node.__init__(self, None)
|
||||
self.data = data
|
||||
|
||||
def __unicode__(self):
|
||||
return "<!-- %s -->" % self.data
|
||||
|
||||
def toxml(self):
|
||||
return "<!--%s-->" % self.data
|
||||
|
||||
def hilite(self):
|
||||
return '<code class="markup comment"><!--%s--></code>' % escape(self.data)
|
||||
|
||||
def cloneNode(self):
|
||||
return CommentNode(self.data)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = CommentNode
|
||||
fragmentClass = DocumentFragment
|
||||
|
||||
def testSerializer(self, node):
|
||||
return node.printTree()
|
||||
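As a usage note, a minimal sketch of the simpletree builder above, whose printTree() output matches the indented test format (illustrative only, not part of the committed file; it assumes getTreeBuilder registers this backend under "simpletree" and the sample markup is made up):

from html5lib import HTMLParser, treebuilders

parser = HTMLParser(tree=treebuilders.getTreeBuilder("simpletree"))
document = parser.parse("<p class=note>Hello")
print document.printTree()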
236
libs/html5lib/treebuilders/soup.py
Normal file
@@ -0,0 +1,236 @@
|
||||
import warnings
|
||||
|
||||
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
||||
|
||||
import _base
|
||||
from html5lib.constants import namespaces, DataLossWarning
|
||||
|
||||
class AttrList(object):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
self.attrs = dict(self.element.attrs)
|
||||
def __iter__(self):
|
||||
return self.attrs.items().__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
"set attr", name, value
|
||||
self.element[name] = value
|
||||
def items(self):
|
||||
return self.attrs.items()
|
||||
def keys(self):
|
||||
return self.attrs.keys()
|
||||
def __getitem__(self, name):
|
||||
return self.attrs[name]
|
||||
def __contains__(self, name):
|
||||
return name in self.attrs.keys()
|
||||
def __eq__(self, other):
|
||||
if len(self.keys()) != len(other.keys()):
|
||||
return False
|
||||
for item in self.keys():
|
||||
if item not in other:
|
||||
return False
|
||||
if self[item] != other[item]:
|
||||
return False
|
||||
return True
|
||||
|
||||
class Element(_base.Node):
|
||||
def __init__(self, element, soup, namespace):
|
||||
_base.Node.__init__(self, element.name)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
self.namespace = namespace
|
||||
|
||||
def _nodeIndex(self, node, refNode):
|
||||
# Finds a node by identity rather than equality
|
||||
for index in range(len(self.element.contents)):
|
||||
if id(self.element.contents[index]) == id(refNode.element):
|
||||
return index
|
||||
return None
|
||||
|
||||
def appendChild(self, node):
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
# Concatenate new text onto old text node
|
||||
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
|
||||
newStr = NavigableString(self.element.contents[-1]+node.element)
|
||||
|
||||
# Remove the old text node
|
||||
# (Can't simply use .extract() by itself, because it fails if
|
||||
# an equal text node exists within the parent node)
|
||||
oldElement = self.element.contents[-1]
|
||||
del self.element.contents[-1]
|
||||
oldElement.parent = None
|
||||
oldElement.extract()
|
||||
|
||||
self.element.insert(len(self.element.contents), newStr)
|
||||
else:
|
||||
self.element.insert(len(self.element.contents), node.element)
|
||||
node.parent = self
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
if attributes:
|
||||
for name, value in attributes.items():
|
||||
self.element[name] = value
|
||||
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = TextNode(NavigableString(data), self.soup)
|
||||
if insertBefore:
|
||||
self.insertBefore(text, insertBefore)
|
||||
else:
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self._nodeIndex(node, refNode)
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[index-1].__class__ == NavigableString):
|
||||
# (See comments in appendChild)
|
||||
newStr = NavigableString(self.element.contents[index-1]+node.element)
|
||||
oldNode = self.element.contents[index-1]
|
||||
del self.element.contents[index-1]
|
||||
oldNode.parent = None
|
||||
oldNode.extract()
|
||||
|
||||
self.element.insert(index-1, newStr)
|
||||
else:
|
||||
self.element.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
index = self._nodeIndex(node.parent, node)
|
||||
del node.parent.element.contents[index]
|
||||
node.element.parent = None
|
||||
node.element.extract()
|
||||
node.parent = None
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
while self.element.contents:
|
||||
child = self.element.contents[0]
|
||||
child.extract()
|
||||
if isinstance(child, Tag):
|
||||
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
|
||||
else:
|
||||
newParent.appendChild(TextNode(child, self.soup))
|
||||
|
||||
def cloneNode(self):
|
||||
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
|
||||
for key,value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TextNode(Element):
|
||||
def __init__(self, element, soup):
|
||||
_base.Node.__init__(self, None)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
|
||||
def cloneNode(self):
|
||||
raise NotImplementedError
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def __init__(self, namespaceHTMLElements):
|
||||
if namespaceHTMLElements:
|
||||
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
|
||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||
|
||||
def documentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
if publicId:
|
||||
self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
|
||||
elif systemId:
|
||||
self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
|
||||
(name, systemId)))
|
||||
else:
|
||||
self.soup.insert(0, Declaration("DOCTYPE %s"%name))
|
||||
|
||||
def elementClass(self, name, namespace):
|
||||
if namespace is not None:
|
||||
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
|
||||
return Element(Tag(self.soup, name), self.soup, namespace)
|
||||
|
||||
def commentClass(self, data):
|
||||
return TextNode(Comment(data), self.soup)
|
||||
|
||||
def fragmentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
self.soup.name = "[document_fragment]"
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def appendChild(self, node):
|
||||
self.soup.insert(len(self.soup.contents), node.element)
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.soup
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def testSerializer(element):
|
||||
import re
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if isinstance(element, Declaration):
|
||||
doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
|
||||
m = re.compile(doctype_regexp).match(element.string)
|
||||
assert m is not None, "DOCTYPE did not match expected format"
|
||||
name = m.group('name')
|
||||
publicId = m.group('publicId')
|
||||
if publicId is not None:
|
||||
systemId = m.group('systemId1') or ""
|
||||
else:
|
||||
systemId = m.group('systemId2')
|
||||
|
||||
if publicId is not None or systemId is not None:
|
||||
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
|
||||
(' '*indent, name, publicId or "", systemId or ""))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
|
||||
|
||||
elif isinstance(element, BeautifulSoup):
|
||||
if element.name == "[document_fragment]":
|
||||
rv.append("#document-fragment")
|
||||
else:
|
||||
rv.append("#document")
|
||||
|
||||
elif isinstance(element, Comment):
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
|
||||
elif isinstance(element, unicode):
|
||||
rv.append("|%s\"%s\"" %(' '*indent, element))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.name))
|
||||
if element.attrs:
|
||||
for name, value in sorted(element.attrs):
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
indent += 2
|
||||
if hasattr(element, "contents"):
|
||||
for child in element.contents:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
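For completeness, a minimal sketch of the deprecated BeautifulSoup builder above (illustrative only, not part of the committed file; it assumes BeautifulSoup 3.x is installed, that getTreeBuilder registers this backend under "beautifulsoup", and the sample markup is made up):

from html5lib import HTMLParser, treebuilders

parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
soup = parser.parse("<p>soup-backed tree")
print soup.prettify()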
52
libs/html5lib/treewalkers/__init__.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""A collection of modules for iterating through different kinds of
|
||||
tree, generating tokens identical to those produced by the tokenizer
|
||||
module.
|
||||
|
||||
To create a tree walker for a new type of tree, you need to
|
||||
implement a tree walker object (called TreeWalker by convention) that
|
||||
implements a 'serialize' method taking a tree as sole argument and
|
||||
returning an iterator generating tokens.
|
||||
"""
|
||||
|
||||
treeWalkerCache = {}
|
||||
|
||||
def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||
"""Get a TreeWalker class for various types of tree with built-in support
|
||||
|
||||
treeType - the name of the tree type required (case-insensitive). Supported
|
||||
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
||||
|
||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||
more pythonic idioms.
|
||||
"dom" - The xml.dom.minidom DOM implementation
|
||||
"pulldom" - The xml.dom.pulldom event stream
|
||||
"etree" - A generic walker for tree implementations exposing an
|
||||
elementtree-like interface (known to work with
|
||||
ElementTree, cElementTree and lxml.etree).
|
||||
"lxml" - Optimized walker for lxml.etree
|
||||
"beautifulsoup" - Beautiful soup (if installed)
|
||||
"genshi" - a Genshi stream
|
||||
|
||||
implementation - (Currently applies to the "etree" tree type only). A module
|
||||
implementing the tree type e.g. xml.etree.ElementTree or
|
||||
cElementTree."""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeWalkerCache:
|
||||
if treeType in ("dom", "pulldom", "simpletree"):
|
||||
mod = __import__(treeType, globals())
|
||||
treeWalkerCache[treeType] = mod.TreeWalker
|
||||
elif treeType == "genshi":
|
||||
import genshistream
|
||||
treeWalkerCache[treeType] = genshistream.TreeWalker
|
||||
elif treeType == "beautifulsoup":
|
||||
import soup
|
||||
treeWalkerCache[treeType] = soup.TreeWalker
|
||||
elif treeType == "lxml":
|
||||
import lxmletree
|
||||
treeWalkerCache[treeType] = lxmletree.TreeWalker
|
||||
elif treeType == "etree":
|
||||
import etree
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||
return treeWalkerCache.get(treeType)
|
||||
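To round this off, a minimal sketch of pairing getTreeWalker with a treebuilder to stream a parsed tree back out as tokens (illustrative only, not part of the committed file; the sample markup is made up):

from html5lib import HTMLParser, treebuilders, treewalkers

parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
document = parser.parse("<p>walk <b>me")
walker = treewalkers.getTreeWalker("dom")
for token in walker(document):
    print token["type"], token.get("name", token.get("data", ""))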
176
libs/html5lib/treewalkers/_base.py
Normal file
@@ -0,0 +1,176 @@
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.constants import voidElements, spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class TreeWalker(object):
|
||||
def __init__(self, tree):
|
||||
self.tree = tree
|
||||
|
||||
def __iter__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def error(self, msg):
|
||||
return {"type": "SerializeError", "data": msg}
|
||||
|
||||
def normalizeAttrs(self, attrs):
|
||||
newattrs = {}
|
||||
if attrs:
|
||||
#TODO: treewalkers should always have attrs
|
||||
for (namespace,name),value in attrs.iteritems():
|
||||
namespace = unicode(namespace) if namespace else None
|
||||
name = unicode(name)
|
||||
value = unicode(value)
|
||||
newattrs[(namespace,name)] = value
|
||||
return newattrs
|
||||
|
||||
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
||||
yield {"type": "EmptyTag", "name": unicode(name),
|
||||
"namespace":unicode(namespace),
|
||||
"data": self.normalizeAttrs(attrs)}
|
||||
if hasChildren:
|
||||
yield self.error(_("Void element has children"))
|
||||
|
||||
def startTag(self, namespace, name, attrs):
|
||||
return {"type": "StartTag",
|
||||
"name": unicode(name),
|
||||
"namespace":unicode(namespace),
|
||||
"data": self.normalizeAttrs(attrs)}
|
||||
|
||||
def endTag(self, namespace, name):
|
||||
return {"type": "EndTag",
|
||||
"name": unicode(name),
|
||||
"namespace":unicode(namespace),
|
||||
"data": {}}
|
||||
|
||||
def text(self, data):
|
||||
data = unicode(data)
|
||||
middle = data.lstrip(spaceCharacters)
|
||||
left = data[:len(data)-len(middle)]
|
||||
if left:
|
||||
yield {"type": "SpaceCharacters", "data": left}
|
||||
data = middle
|
||||
middle = data.rstrip(spaceCharacters)
|
||||
right = data[len(middle):]
|
||||
if middle:
|
||||
yield {"type": "Characters", "data": middle}
|
||||
if right:
|
||||
yield {"type": "SpaceCharacters", "data": right}
|
||||
|
||||
def comment(self, data):
|
||||
return {"type": "Comment", "data": unicode(data)}
|
||||
|
||||
def doctype(self, name, publicId=None, systemId=None, correct=True):
|
||||
return {"type": "Doctype",
|
||||
"name": name is not None and unicode(name) or u"",
|
||||
"publicId": publicId,
|
||||
"systemId": systemId,
|
||||
"correct": correct}
|
||||
|
||||
def entity(self, name):
|
||||
return {"type": "Entity", "name": unicode(name)}
|
||||
|
||||
def unknown(self, nodeType):
|
||||
return self.error(_("Unknown node type: ") + nodeType)
|
||||
|
||||
class RecursiveTreeWalker(TreeWalker):
|
||||
def walkChildren(self, node):
|
||||
raise NodeImplementedError
|
||||
|
||||
def element(self, node, namespace, name, attrs, hasChildren):
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(namespace, name, attrs, hasChildren):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(name, attrs)
|
||||
if hasChildren:
|
||||
for token in self.walkChildren(node):
|
||||
yield token
|
||||
yield self.endTag(name)
|
||||
|
||||
from xml.dom import Node
|
||||
|
||||
DOCUMENT = Node.DOCUMENT_NODE
|
||||
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
||||
TEXT = Node.TEXT_NODE
|
||||
ELEMENT = Node.ELEMENT_NODE
|
||||
COMMENT = Node.COMMENT_NODE
|
||||
ENTITY = Node.ENTITY_NODE
|
||||
UNKNOWN = "<#UNKNOWN#>"
|
||||
|
||||
class NonRecursiveTreeWalker(TreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getFirstChild(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getNextSibling(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getParentNode(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def __iter__(self):
|
||||
currentNode = self.tree
|
||||
while currentNode is not None:
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
hasChildren = False
|
||||
endTag = None
|
||||
|
||||
if type == DOCTYPE:
|
||||
yield self.doctype(*details)
|
||||
|
||||
elif type == TEXT:
|
||||
for token in self.text(*details):
|
||||
yield token
|
||||
|
||||
elif type == ELEMENT:
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(namespace, name, attributes,
|
||||
hasChildren):
|
||||
yield token
|
||||
hasChildren = False
|
||||
else:
|
||||
endTag = name
|
||||
yield self.startTag(namespace, name, attributes)
|
||||
|
||||
elif type == COMMENT:
|
||||
yield self.comment(details[0])
|
||||
|
||||
elif type == ENTITY:
|
||||
yield self.entity(details[0])
|
||||
|
||||
elif type == DOCUMENT:
|
||||
hasChildren = True
|
||||
|
||||
else:
|
||||
yield self.unknown(details[0])
|
||||
|
||||
if hasChildren:
|
||||
firstChild = self.getFirstChild(currentNode)
|
||||
else:
|
||||
firstChild = None
|
||||
|
||||
if firstChild is not None:
|
||||
currentNode = firstChild
|
||||
else:
|
||||
while currentNode is not None:
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
if type == ELEMENT:
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
if self.tree is currentNode:
|
||||
currentNode = None
|
||||
break
|
||||
nextSibling = self.getNextSibling(currentNode)
|
||||
if nextSibling is not None:
|
||||
currentNode = nextSibling
|
||||
break
|
||||
else:
|
||||
currentNode = self.getParentNode(currentNode)
|
||||
41
libs/html5lib/treewalkers/dom.py
Normal file
@@ -0,0 +1,41 @@
from xml.dom import Node

import gettext
_ = gettext.gettext

import _base
from html5lib.constants import voidElements

class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return _base.TEXT, node.nodeValue

        elif node.nodeType == Node.ELEMENT_NODE:
            attrs = {}
            for attr in node.attributes.keys():
                attr = node.getAttributeNode(attr)
                attrs[(attr.namespaceURI,attr.localName)] = attr.value
            return (_base.ELEMENT, node.namespaceURI, node.nodeName,
                    attrs, node.hasChildNodes())

        elif node.nodeType == Node.COMMENT_NODE:
            return _base.COMMENT, node.nodeValue

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (_base.DOCUMENT,)

        else:
            return _base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode
141
libs/html5lib/treewalkers/etree.py
Normal file
@@ -0,0 +1,141 @@
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
try:
|
||||
from types import ModuleType
|
||||
except:
|
||||
from new import module as ModuleType
|
||||
import copy
|
||||
import re
|
||||
|
||||
import _base
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
def getETreeModule(ElementTreeImplementation):
|
||||
name = "_" + ElementTreeImplementation.__name__+"builder"
|
||||
if name in moduleCache:
|
||||
return moduleCache[name]
|
||||
else:
|
||||
mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder")
|
||||
objs = getETreeBuilder(ElementTreeImplementation)
|
||||
mod.__dict__.update(objs)
|
||||
moduleCache[name] = mod
|
||||
return mod
|
||||
|
||||
def getETreeBuilder(ElementTreeImplementation):
|
||||
ElementTree = ElementTreeImplementation
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
"""Given the particular ElementTree representation, this implementation,
|
||||
to avoid using recursion, returns "nodes" as tuples with the following
|
||||
content:
|
||||
|
||||
1. The current element
|
||||
|
||||
2. The index of the element relative to its parent
|
||||
|
||||
3. A stack of ancestor elements
|
||||
|
||||
4. A flag "text", "tail" or None to indicate if the current node is a
|
||||
text node; either the text or tail of the current element (1)
|
||||
"""
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Element
|
||||
elt, key, parents, flag = node
|
||||
if flag in ("text", "tail"):
|
||||
return _base.TEXT, getattr(elt, flag)
|
||||
else:
|
||||
node = elt
|
||||
|
||||
if not(hasattr(node, "tag")):
|
||||
node = node.getroot()
|
||||
|
||||
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif node.tag == "<!DOCTYPE>":
|
||||
return (_base.DOCTYPE, node.text,
|
||||
node.get("publicId"), node.get("systemId"))
|
||||
|
||||
elif node.tag == ElementTree.Comment:
|
||||
return _base.COMMENT, node.text
|
||||
|
||||
else:
|
||||
assert type(node.tag) in (str, unicode), type(node.tag)
|
||||
#This is assumed to be an ordinary element
|
||||
match = tag_regexp.match(node.tag)
|
||||
if match:
|
||||
namespace, tag = match.groups()
|
||||
else:
|
||||
namespace = None
|
||||
tag = node.tag
|
||||
attrs = {}
|
||||
for name, value in node.attrib.items():
|
||||
match = tag_regexp.match(name)
|
||||
if match:
|
||||
attrs[(match.group(1),match.group(2))] = value
|
||||
else:
|
||||
attrs[(None,name)] = value
|
||||
return (_base.ELEMENT, namespace, tag,
|
||||
attrs, len(node) or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
element, key, parents, flag = node, None, [], None
|
||||
|
||||
if flag in ("text", "tail"):
|
||||
return None
|
||||
else:
|
||||
if element.text:
|
||||
return element, key, parents, "text"
|
||||
elif len(element):
|
||||
parents.append(element)
|
||||
return element[0], 0, parents, None
|
||||
else:
|
||||
return None
|
||||
|
||||
def getNextSibling(self, node):
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
return None
|
||||
|
||||
if flag == "text":
|
||||
if len(element):
|
||||
parents.append(element)
|
||||
return element[0], 0, parents, None
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
if element.tail and flag != "tail":
|
||||
return element, key, parents, "tail"
|
||||
elif key < len(parents[-1]) - 1:
|
||||
return parents[-1][key+1], key+1, parents, None
|
||||
else:
|
||||
return None
|
||||
|
||||
def getParentNode(self, node):
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
return None
|
||||
|
||||
if flag == "text":
|
||||
if not parents:
|
||||
return element
|
||||
else:
|
||||
return element, key, parents, None
|
||||
else:
|
||||
parent = parents.pop()
|
||||
if not parents:
|
||||
return parent
|
||||
else:
|
||||
return parent, list(parents[-1]).index(parent), parents, None
|
||||
|
||||
return locals()
|
||||
70
libs/html5lib/treewalkers/genshistream.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
|
||||
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||
from genshi.output import NamespaceFlattener
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
class TreeWalker(_base.TreeWalker):
|
||||
def __iter__(self):
|
||||
depth = 0
|
||||
ignore_until = None
|
||||
previous = None
|
||||
for event in self.tree:
|
||||
if previous is not None:
|
||||
if previous[0] == START:
|
||||
depth += 1
|
||||
if ignore_until <= depth:
|
||||
ignore_until = None
|
||||
if ignore_until is None:
|
||||
for token in self.tokens(previous, event):
|
||||
yield token
|
||||
if token["type"] == "EmptyTag":
|
||||
ignore_until = depth
|
||||
if previous[0] == END:
|
||||
depth -= 1
|
||||
previous = event
|
||||
if previous is not None:
|
||||
if ignore_until is None or ignore_until <= depth:
|
||||
for token in self.tokens(previous, None):
|
||||
yield token
|
||||
elif ignore_until is not None:
|
||||
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
|
||||
|
||||
def tokens(self, event, next):
|
||||
kind, data, pos = event
|
||||
if kind == START:
|
||||
tag, attrib = data
|
||||
name = tag.localname
|
||||
namespace = tag.namespace
|
||||
if tag in voidElements:
|
||||
for token in self.emptyTag(namespace, name, list(attrib),
|
||||
not next or next[0] != END
|
||||
or next[1] != tag):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(namespace, name, list(attrib))
|
||||
|
||||
elif kind == END:
|
||||
name = data.localname
|
||||
namespace = data.namespace
|
||||
if name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
|
||||
elif kind == COMMENT:
|
||||
yield self.comment(data)
|
||||
|
||||
elif kind == TEXT:
|
||||
for token in self.text(data):
|
||||
yield token
|
||||
|
||||
elif kind == DOCTYPE:
|
||||
yield self.doctype(*data)
|
||||
|
||||
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
|
||||
START_CDATA, END_CDATA, PI):
|
||||
pass
|
||||
|
||||
else:
|
||||
yield self.unknown(kind)
|
||||
186
libs/html5lib/treewalkers/lxmletree.py
Normal file
@@ -0,0 +1,186 @@
|
||||
from lxml import etree
|
||||
from html5lib.treebuilders.etree import tag_regexp
|
||||
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
from html5lib import ihatexml
|
||||
|
||||
class Root(object):
|
||||
def __init__(self, et):
|
||||
self.elementtree = et
|
||||
self.children = []
|
||||
if et.docinfo.internalDTD:
|
||||
self.children.append(Doctype(self, et.docinfo.root_name,
|
||||
et.docinfo.public_id,
|
||||
et.docinfo.system_url))
|
||||
root = et.getroot()
|
||||
node = root
|
||||
|
||||
while node.getprevious() is not None:
|
||||
node = node.getprevious()
|
||||
while node is not None:
|
||||
self.children.append(node)
|
||||
node = node.getnext()
|
||||
|
||||
self.text = None
|
||||
self.tail = None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.children[key]
|
||||
|
||||
def getnext(self):
|
||||
return None
|
||||
|
||||
def __len__(self):
|
||||
return 1
|
||||
|
||||
class Doctype(object):
|
||||
def __init__(self, root_node, name, public_id, system_id):
|
||||
self.root_node = root_node
|
||||
self.name = name
|
||||
self.public_id = public_id
|
||||
self.system_id = system_id
|
||||
|
||||
self.text = None
|
||||
self.tail = None
|
||||
|
||||
def getnext(self):
|
||||
return self.root_node.children[1]
|
||||
|
||||
class FragmentRoot(Root):
|
||||
def __init__(self, children):
|
||||
self.children = [FragmentWrapper(self, child) for child in children]
|
||||
self.text = self.tail = None
|
||||
|
||||
def getnext(self):
|
||||
return None
|
||||
|
||||
class FragmentWrapper(object):
|
||||
def __init__(self, fragment_root, obj):
|
||||
self.root_node = fragment_root
|
||||
self.obj = obj
|
||||
if hasattr(self.obj, 'text'):
|
||||
self.text = self.obj.text
|
||||
else:
|
||||
self.text = None
|
||||
if hasattr(self.obj, 'tail'):
|
||||
self.tail = self.obj.tail
|
||||
else:
|
||||
self.tail = None
|
||||
self.isstring = isinstance(obj, basestring)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.obj, name)
|
||||
|
||||
def getnext(self):
|
||||
siblings = self.root_node.children
|
||||
idx = siblings.index(self)
|
||||
if idx < len(siblings) - 1:
|
||||
return siblings[idx + 1]
|
||||
else:
|
||||
return None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.obj[key]
|
||||
|
||||
def __nonzero__(self):
|
||||
return bool(self.obj)
|
||||
|
||||
def getparent(self):
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
return str(self.obj)
|
||||
|
||||
def __unicode__(self):
|
||||
return unicode(self.obj)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.obj)
|
||||
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
def __init__(self, tree):
|
||||
if hasattr(tree, "getroot"):
|
||||
tree = Root(tree)
|
||||
elif isinstance(tree, list):
|
||||
tree = FragmentRoot(tree)
|
||||
_base.NonRecursiveTreeWalker.__init__(self, tree)
|
||||
self.filter = ihatexml.InfosetFilter()
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
||||
return _base.TEXT, getattr(node, key)
|
||||
|
||||
elif isinstance(node, Root):
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif isinstance(node, Doctype):
|
||||
return _base.DOCTYPE, node.name, node.public_id, node.system_id
|
||||
|
||||
elif isinstance(node, FragmentWrapper) and node.isstring:
|
||||
return _base.TEXT, node
|
||||
|
||||
elif node.tag == etree.Comment:
|
||||
return _base.COMMENT, node.text
|
||||
|
||||
elif node.tag == etree.Entity:
|
||||
return _base.ENTITY, node.text[1:-1] # strip &;
|
||||
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
match = tag_regexp.match(node.tag)
|
||||
if match:
|
||||
namespace, tag = match.groups()
|
||||
else:
|
||||
namespace = None
|
||||
tag = node.tag
|
||||
attrs = {}
|
||||
for name, value in node.attrib.items():
|
||||
match = tag_regexp.match(name)
|
||||
if match:
|
||||
attrs[(match.group(1),match.group(2))] = value
|
||||
else:
|
||||
attrs[(None,name)] = value
|
||||
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
|
||||
attrs, len(node) > 0 or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
assert not isinstance(node, tuple), _("Text nodes have no children")
|
||||
|
||||
assert len(node) or node.text, "Node has no children"
|
||||
if node.text:
|
||||
return (node, "text")
|
||||
else:
|
||||
return node[0]
|
||||
|
||||
def getNextSibling(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
||||
if key == "text":
|
||||
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
|
||||
# because node[0] might evaluate to False if it has no child element
|
||||
if len(node):
|
||||
return node[0]
|
||||
else:
|
||||
return None
|
||||
else: # tail
|
||||
return node.getnext()
|
||||
|
||||
return node.tail and (node, "tail") or node.getnext()
|
||||
|
||||
def getParentNode(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
||||
if key == "text":
|
||||
return node
|
||||
# else: fallback to "normal" processing
|
||||
|
||||
return node.getparent()
|
||||
60
libs/html5lib/treewalkers/pulldom.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
|
||||
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
class TreeWalker(_base.TreeWalker):
|
||||
def __iter__(self):
|
||||
ignore_until = None
|
||||
previous = None
|
||||
for event in self.tree:
|
||||
if previous is not None and \
|
||||
(ignore_until is None or previous[1] is ignore_until):
|
||||
if previous[1] is ignore_until:
|
||||
ignore_until = None
|
||||
for token in self.tokens(previous, event):
|
||||
yield token
|
||||
if token["type"] == "EmptyTag":
|
||||
ignore_until = previous[1]
|
||||
previous = event
|
||||
if ignore_until is None or previous[1] is ignore_until:
|
||||
for token in self.tokens(previous, None):
|
||||
yield token
|
||||
elif ignore_until is not None:
|
||||
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
|
||||
|
||||
def tokens(self, event, next):
|
||||
type, node = event
|
||||
if type == START_ELEMENT:
|
||||
name = node.nodeName
|
||||
namespace = node.namespaceURI
|
||||
attrs = {}
|
||||
for attr in node.attributes.keys():
|
||||
attr = node.getAttributeNode(attr)
|
||||
attrs[(attr.namespaceURI,attr.localName)] = attr.value
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(namespace,
|
||||
name,
|
||||
attrs,
|
||||
not next or next[1] is not node):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(namespace, name, attrs)
|
||||
|
||||
elif type == END_ELEMENT:
|
||||
name = node.nodeName
|
||||
namespace = node.namespaceURI
|
||||
if name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
|
||||
elif type == COMMENT:
|
||||
yield self.comment(node.nodeValue)
|
||||
|
||||
elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
|
||||
for token in self.text(node.nodeValue):
|
||||
yield token
|
||||
|
||||
else:
|
||||
yield self.unknown(type)
|
||||
78
libs/html5lib/treewalkers/simpletree.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
import _base
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
"""Given that simpletree has no performant way of getting a node's
|
||||
next sibling, this implementation returns "nodes" as tuples with the
|
||||
following content:
|
||||
|
||||
1. The parent Node (Element, Document or DocumentFragment)
|
||||
|
||||
2. The child index of the current node in its parent's children list
|
||||
|
||||
3. A stack of all ancestors, kept as a list of (parent Node, child index) pairs.
|
||||
"""
|
||||
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Node
|
||||
parent, idx, parents = node
|
||||
node = parent.childNodes[idx]
|
||||
|
||||
# testing node.type allows us not to import treebuilders.simpletree
|
||||
if node.type in (1, 2): # Document or DocumentFragment
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif node.type == 3: # DocumentType
|
||||
return _base.DOCTYPE, node.name, node.publicId, node.systemId
|
||||
|
||||
elif node.type == 4: # TextNode
|
||||
return _base.TEXT, node.value
|
||||
|
||||
elif node.type == 5: # Element
|
||||
attrs = {}
|
||||
for name, value in node.attributes.items():
|
||||
if isinstance(name, tuple):
|
||||
attrs[(name[2],name[1])] = value
|
||||
else:
|
||||
attrs[(None,name)] = value
|
||||
return (_base.ELEMENT, node.namespace, node.name,
|
||||
attrs, node.hasContent())
|
||||
|
||||
elif node.type == 6: # CommentNode
|
||||
return _base.COMMENT, node.data
|
||||
|
||||
else:
|
||||
return _base.UNKNOWN, node.type
|
||||
|
||||
def getFirstChild(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Node
|
||||
parent, idx, parents = node
|
||||
parents.append((parent, idx))
|
||||
node = parent.childNodes[idx]
|
||||
else:
|
||||
parents = []
|
||||
|
||||
assert node.hasContent(), "Node has no children"
|
||||
return (node, 0, parents)
|
||||
|
||||
def getNextSibling(self, node):
|
||||
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
|
||||
parent, idx, parents = node
|
||||
idx += 1
|
||||
if len(parent.childNodes) > idx:
|
||||
return (parent, idx, parents)
|
||||
else:
|
||||
return None
|
||||
|
||||
def getParentNode(self, node):
|
||||
assert isinstance(node, tuple)
|
||||
parent, idx, parents = node
|
||||
if parents:
|
||||
parent, idx = parents.pop()
|
||||
return parent, idx, parents
|
||||
else:
|
||||
# HACK: We could return ``parent`` but None will stop the algorithm the same way
|
||||
return None
|
||||
60
libs/html5lib/treewalkers/soup.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import re
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
|
||||
from html5lib.constants import namespaces
|
||||
import _base
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
doctype_regexp = re.compile(
|
||||
r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif isinstance(node, Declaration): # DocumentType
|
||||
string = unicode(node.string)
|
||||
#Slice needed to remove markup added during unicode conversion,
|
||||
#but only in some versions of BeautifulSoup/Python
|
||||
if string.startswith('<!') and string.endswith('>'):
|
||||
string = string[2:-1]
|
||||
m = self.doctype_regexp.match(string)
|
||||
#This regexp approach seems wrong and fragile
|
||||
#but beautiful soup stores the doctype as a single thing and we want the separate bits
|
||||
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
|
||||
#been modified at all
|
||||
#We could just feed it to an html5lib tokenizer, I guess...
|
||||
assert m is not None, "DOCTYPE did not match expected format"
|
||||
|
||||
name = m.group('name')
|
||||
publicId = m.group('publicId')
|
||||
if publicId is not None:
|
||||
systemId = m.group('systemId1')
|
||||
else:
|
||||
systemId = m.group('systemId2')
|
||||
return _base.DOCTYPE, name, publicId or "", systemId or ""
|
||||
|
||||
elif isinstance(node, Comment):
|
||||
string = unicode(node.string)
|
||||
if string.startswith('<!--') and string.endswith('-->'):
|
||||
string = string[4:-3]
|
||||
return _base.COMMENT, string
|
||||
|
||||
elif isinstance(node, unicode): # TextNode
|
||||
return _base.TEXT, node
|
||||
|
||||
elif isinstance(node, Tag): # Element
|
||||
return (_base.ELEMENT, namespaces["html"], node.name,
|
||||
dict(node.attrs).items(), node.contents)
|
||||
else:
|
||||
return _base.UNKNOWN, node.__class__.__name__
|
||||
|
||||
def getFirstChild(self, node):
|
||||
return node.contents[0]
|
||||
|
||||
def getNextSibling(self, node):
|
||||
return node.nextSibling
|
||||
|
||||
def getParentNode(self, node):
|
||||
return node.parent
|
||||
175
libs/html5lib/utils.py
Normal file
@@ -0,0 +1,175 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
#Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
class MethodDispatcher(dict):
|
||||
"""Dict with 2 special properties:
|
||||
|
||||
On initiation, keys that are lists, sets or tuples are converted to
|
||||
multiple keys so accessing any one of the items in the original
|
||||
list-like object returns the matching value
|
||||
|
||||
md = MethodDispatcher({("foo", "bar"):"baz"})
|
||||
md["foo"] == "baz"
|
||||
|
||||
A default value which can be set through the default attribute.
|
||||
"""
|
||||
|
||||
def __init__(self, items=()):
|
||||
# Using _dictEntries instead of directly assigning to self is about
|
||||
# twice as fast. Please do careful performance testing before changing
|
||||
# anything here.
|
||||
_dictEntries = []
|
||||
for name,value in items:
|
||||
if type(name) in (list, tuple, frozenset, set):
|
||||
for item in name:
|
||||
_dictEntries.append((item, value))
|
||||
else:
|
||||
_dictEntries.append((name, value))
|
||||
dict.__init__(self, _dictEntries)
|
||||
self.default = None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return dict.get(self, key, self.default)
|
||||
|
||||
#Pure python implementation of deque taken from the ASPN Python Cookbook
|
||||
#Original code by Raymond Hettinger
|
||||
|
||||
class deque(object):
|
||||
|
||||
def __init__(self, iterable=(), maxsize=-1):
|
||||
if not hasattr(self, 'data'):
|
||||
self.left = self.right = 0
|
||||
self.data = {}
|
||||
self.maxsize = maxsize
|
||||
self.extend(iterable)
|
||||
|
||||
def append(self, x):
|
||||
self.data[self.right] = x
|
||||
self.right += 1
|
||||
if self.maxsize != -1 and len(self) > self.maxsize:
|
||||
self.popleft()
|
||||
|
||||
def appendleft(self, x):
|
||||
self.left -= 1
|
||||
self.data[self.left] = x
|
||||
if self.maxsize != -1 and len(self) > self.maxsize:
|
||||
self.pop()
|
||||
|
||||
def pop(self):
|
||||
if self.left == self.right:
|
||||
raise IndexError('cannot pop from empty deque')
|
||||
self.right -= 1
|
||||
elem = self.data[self.right]
|
||||
del self.data[self.right]
|
||||
return elem
|
||||
|
||||
def popleft(self):
|
||||
if self.left == self.right:
|
||||
raise IndexError('cannot pop from empty deque')
|
||||
elem = self.data[self.left]
|
||||
del self.data[self.left]
|
||||
self.left += 1
|
||||
return elem
|
||||
|
||||
def clear(self):
|
||||
self.data.clear()
|
||||
self.left = self.right = 0
|
||||
|
||||
def extend(self, iterable):
|
||||
for elem in iterable:
|
||||
self.append(elem)
|
||||
|
||||
def extendleft(self, iterable):
|
||||
for elem in iterable:
|
||||
self.appendleft(elem)
|
||||
|
||||
def rotate(self, n=1):
|
||||
if self:
|
||||
n %= len(self)
|
||||
for i in xrange(n):
|
||||
self.appendleft(self.pop())
|
||||
|
||||
def __getitem__(self, i):
|
||||
if i < 0:
|
||||
i += len(self)
|
||||
try:
|
||||
return self.data[i + self.left]
|
||||
except KeyError:
|
||||
raise IndexError
|
||||
|
||||
def __setitem__(self, i, value):
|
||||
if i < 0:
|
||||
i += len(self)
|
||||
try:
|
||||
self.data[i + self.left] = value
|
||||
except KeyError:
|
||||
raise IndexError
|
||||
|
||||
def __delitem__(self, i):
|
||||
size = len(self)
|
||||
if not (-size <= i < size):
|
||||
raise IndexError
|
||||
data = self.data
|
||||
if i < 0:
|
||||
i += size
|
||||
for j in xrange(self.left+i, self.right-1):
|
||||
data[j] = data[j+1]
|
||||
self.pop()
|
||||
|
||||
def __len__(self):
|
||||
return self.right - self.left
|
||||
|
||||
def __cmp__(self, other):
|
||||
if type(self) != type(other):
|
||||
return cmp(type(self), type(other))
|
||||
return cmp(list(self), list(other))
|
||||
|
||||
def __repr__(self, _track=[]):
|
||||
if id(self) in _track:
|
||||
return '...'
|
||||
_track.append(id(self))
|
||||
r = 'deque(%r)' % (list(self),)
|
||||
_track.remove(id(self))
|
||||
return r
|
||||
|
||||
def __getstate__(self):
|
||||
return (tuple(self),)
|
||||
|
||||
def __setstate__(self, s):
|
||||
self.__init__(s[0])
|
||||
|
||||
def __hash__(self):
|
||||
raise TypeError
|
||||
|
||||
def __copy__(self):
|
||||
return self.__class__(self)
|
||||
|
||||
def __deepcopy__(self, memo={}):
|
||||
from copy import deepcopy
|
||||
result = self.__class__()
|
||||
memo[id(self)] = result
|
||||
result.__init__(deepcopy(tuple(self), memo))
|
||||
return result
|
||||
|
||||
#Some utility functions to deal with weirdness around UCS2 vs UCS4
|
||||
#python builds
|
||||
|
||||
def encodingType():
|
||||
if len(u"\U0010FFFF") == 2: # a non-BMP character is a surrogate pair on narrow (UCS2) builds
|
||||
return "UCS2"
|
||||
else:
|
||||
return "UCS4"
|
||||
|
||||
def isSurrogatePair(data):
|
||||
return (len(data) == 2 and
|
||||
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
|
||||
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
|
||||
|
||||
def surrogatePairToCodepoint(data):
|
||||
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
|
||||
(ord(data[1]) - 0xDC00))
|
||||
return char_val
|
||||
0
libs/oauthlib/__init__.py
Normal file
155
libs/oauthlib/common.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
|
||||
"""
|
||||
oauthlib.common
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
This module provides data structures and utilities common
|
||||
to all implementations of OAuth.
|
||||
"""
|
||||
|
||||
import re
|
||||
import urllib
|
||||
import urlparse
|
||||
|
||||
|
||||
always_safe = (u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
u'abcdefghijklmnopqrstuvwxyz'
|
||||
u'0123456789' u'_.-')
|
||||
|
||||
|
||||
def quote(s, safe=u'/'):
|
||||
encoded = s.encode("utf-8")
|
||||
quoted = urllib.quote(encoded, safe)
|
||||
return quoted.decode("utf-8")
|
||||
|
||||
|
||||
def unquote(s):
|
||||
encoded = s.encode("utf-8")
|
||||
unquoted = urllib.unquote(encoded)
|
||||
return unquoted.decode("utf-8")
|
||||
|
||||
|
||||
def urlencode(params):
|
||||
utf8_params = encode_params_utf8(params)
|
||||
urlencoded = urllib.urlencode(utf8_params)
|
||||
return urlencoded.decode("utf-8")
|
||||
|
||||
|
||||
def encode_params_utf8(params):
|
||||
"""Ensures that all parameters in a list of 2-element tuples are encoded to
|
||||
bytestrings using UTF-8
|
||||
"""
|
||||
encoded = []
|
||||
for k, v in params:
|
||||
encoded.append((
|
||||
k.encode('utf-8') if isinstance(k, unicode) else k,
|
||||
v.encode('utf-8') if isinstance(v, unicode) else v))
|
||||
return encoded
|
||||
|
||||
|
||||
def decode_params_utf8(params):
|
||||
"""Ensures that all parameters in a list of 2-element tuples are decoded to
|
||||
unicode using UTF-8.
|
||||
"""
|
||||
decoded = []
|
||||
for k, v in params:
|
||||
decoded.append((
|
||||
k.decode('utf-8') if isinstance(k, str) else k,
|
||||
v.decode('utf-8') if isinstance(v, str) else v))
|
||||
return decoded
|
||||
|
||||
|
||||
urlencoded = set(always_safe) | set(u'=&;%+~')
|
||||
|
||||
|
||||
def urldecode(query):
|
||||
"""Decode a query string in x-www-form-urlencoded format into a sequence
|
||||
of two-element tuples.
|
||||
|
||||
Unlike urlparse.parse_qsl(..., strict_parsing=True) urldecode will enforce
|
||||
correct formatting of the query string by validation. If validation fails
|
||||
a ValueError will be raised. urllib.parse_qsl will only raise errors if
|
||||
any of the name-value pairs omits the equals sign.
|
||||
"""
|
||||
# Check if query contains invalid characters
|
||||
if query and not set(query) <= urlencoded:
|
||||
raise ValueError('Invalid characters in query string.')
|
||||
|
||||
# Check for correctly hex encoded values using a regular expression
|
||||
# All encoded values begin with % followed by two hex characters
|
||||
# correct = %00, %A0, %0A, %FF
|
||||
# invalid = %G0, %5H, %PO
|
||||
invalid_hex = u'%[^0-9A-Fa-f]|%[0-9A-Fa-f][^0-9A-Fa-f]'
|
||||
if len(re.findall(invalid_hex, query)):
|
||||
raise ValueError('Invalid hex encoding in query string.')
|
||||
|
||||
query = query.decode('utf-8') if isinstance(query, str) else query
|
||||
# We want to allow queries such as "c2" whereas urlparse.parse_qsl
|
||||
# with the strict_parsing flag will not.
|
||||
params = urlparse.parse_qsl(query, keep_blank_values=True)
|
||||
|
||||
# unicode all the things
|
||||
return decode_params_utf8(params)
|
||||
|
||||
|
||||
def extract_params(raw):
|
||||
"""Extract parameters and return them as a list of 2-tuples.
|
||||
|
||||
Will successfully extract parameters from urlencoded query strings,
|
||||
dicts, or lists of 2-tuples. Empty strings/dicts/lists will return an
|
||||
empty list of parameters. Any other input will result in a return
|
||||
value of None.
|
||||
"""
|
||||
if isinstance(raw, basestring):
|
||||
try:
|
||||
params = urldecode(raw)
|
||||
except ValueError:
|
||||
params = None
|
||||
elif hasattr(raw, '__iter__'):
|
||||
try:
|
||||
dict(raw)
|
||||
except ValueError:
|
||||
params = None
|
||||
except TypeError:
|
||||
params = None
|
||||
else:
|
||||
params = list(raw.items() if isinstance(raw, dict) else raw)
|
||||
params = decode_params_utf8(params)
|
||||
else:
|
||||
params = None
|
||||
|
||||
return params
|
||||
|
||||
|
||||
class Request(object):
|
||||
"""A malleable representation of a signable HTTP request.
|
||||
|
||||
Body argument may contain any data, but parameters will only be decoded if
|
||||
they are one of:
|
||||
|
||||
* urlencoded query string
|
||||
* dict
|
||||
* list of 2-tuples
|
||||
|
||||
Anything else will be treated as raw body data to be passed through
|
||||
unmolested.
|
||||
"""
|
||||
|
||||
def __init__(self, uri, http_method=u'GET', body=None, headers=None):
|
||||
self.uri = uri
|
||||
self.http_method = http_method
|
||||
self.headers = headers or {}
|
||||
self.body = body
|
||||
self.decoded_body = extract_params(body)
|
||||
self.oauth_params = []
|
||||
|
||||
@property
|
||||
def uri_query(self):
|
||||
return urlparse.urlparse(self.uri).query
|
||||
|
||||
@property
|
||||
def uri_query_params(self):
|
||||
return urlparse.parse_qsl(self.uri_query, keep_blank_values=True,
|
||||
strict_parsing=True)
|
||||
13
libs/oauthlib/oauth1/__init__.py
Normal file
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

"""
oauthlib.oauth1
~~~~~~~~~~~~~~

This module is a wrapper for the most recent implementation of OAuth 1.0 Client
and Server classes.
"""

from .rfc5849 import Client, Server
350
libs/oauthlib/oauth1/rfc5849/__init__.py
Normal file
@@ -0,0 +1,350 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
|
||||
"""
|
||||
oauthlib.oauth1.rfc5849
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
This module is an implementation of various logic needed
|
||||
for signing and checking OAuth 1.0 RFC 5849 requests.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import urlparse
|
||||
|
||||
from oauthlib.common import Request, urlencode
|
||||
from . import parameters, signature, utils
|
||||
|
||||
SIGNATURE_HMAC = u"HMAC-SHA1"
|
||||
SIGNATURE_RSA = u"RSA-SHA1"
|
||||
SIGNATURE_PLAINTEXT = u"PLAINTEXT"
|
||||
SIGNATURE_METHODS = (SIGNATURE_HMAC, SIGNATURE_RSA, SIGNATURE_PLAINTEXT)
|
||||
|
||||
SIGNATURE_TYPE_AUTH_HEADER = u'AUTH_HEADER'
|
||||
SIGNATURE_TYPE_QUERY = u'QUERY'
|
||||
SIGNATURE_TYPE_BODY = u'BODY'
|
||||
|
||||
CONTENT_TYPE_FORM_URLENCODED = u'application/x-www-form-urlencoded'
|
||||
|
||||
|
||||
class Client(object):
|
||||
"""A client used to sign OAuth 1.0 RFC 5849 requests"""
|
||||
def __init__(self, client_key,
|
||||
client_secret=None,
|
||||
resource_owner_key=None,
|
||||
resource_owner_secret=None,
|
||||
callback_uri=None,
|
||||
signature_method=SIGNATURE_HMAC,
|
||||
signature_type=SIGNATURE_TYPE_AUTH_HEADER,
|
||||
rsa_key=None, verifier=None):
|
||||
self.client_key = client_key
|
||||
self.client_secret = client_secret
|
||||
self.resource_owner_key = resource_owner_key
|
||||
self.resource_owner_secret = resource_owner_secret
|
||||
self.signature_method = signature_method
|
||||
self.signature_type = signature_type
|
||||
self.callback_uri = callback_uri
|
||||
self.rsa_key = rsa_key
|
||||
self.verifier = verifier
|
||||
|
||||
if self.signature_method == SIGNATURE_RSA and self.rsa_key is None:
|
||||
raise ValueError('rsa_key is required when using RSA signature method.')
|
||||
|
||||
def get_oauth_signature(self, request):
|
||||
"""Get an OAuth signature to be used in signing a request
|
||||
"""
|
||||
if self.signature_method == SIGNATURE_PLAINTEXT:
|
||||
# fast-path
|
||||
return signature.sign_plaintext(self.client_secret,
|
||||
self.resource_owner_secret)
|
||||
|
||||
uri, headers, body = self._render(request)
|
||||
|
||||
collected_params = signature.collect_parameters(
|
||||
uri_query=urlparse.urlparse(uri).query,
|
||||
body=body,
|
||||
headers=headers)
|
||||
logging.debug("Collected params: {0}".format(collected_params))
|
||||
|
||||
normalized_params = signature.normalize_parameters(collected_params)
|
||||
normalized_uri = signature.normalize_base_string_uri(request.uri)
|
||||
logging.debug("Normalized params: {0}".format(normalized_params))
|
||||
logging.debug("Normalized URI: {0}".format(normalized_uri))
|
||||
|
||||
base_string = signature.construct_base_string(request.http_method,
|
||||
normalized_uri, normalized_params)
|
||||
|
||||
logging.debug("Base signing string: {0}".format(base_string))
|
||||
|
||||
if self.signature_method == SIGNATURE_HMAC:
|
||||
sig = signature.sign_hmac_sha1(base_string, self.client_secret,
|
||||
self.resource_owner_secret)
|
||||
elif self.signature_method == SIGNATURE_RSA:
|
||||
sig = signature.sign_rsa_sha1(base_string, self.rsa_key)
|
||||
else:
|
||||
sig = signature.sign_plaintext(self.client_secret,
|
||||
self.resource_owner_secret)
|
||||
|
||||
logging.debug("Signature: {0}".format(sig))
|
||||
return sig
|
||||
|
||||
def get_oauth_params(self):
|
||||
"""Get the basic OAuth parameters to be used in generating a signature.
|
||||
"""
|
||||
params = [
|
||||
(u'oauth_nonce', utils.generate_nonce()),
|
||||
(u'oauth_timestamp', utils.generate_timestamp()),
|
||||
(u'oauth_version', u'1.0'),
|
||||
(u'oauth_signature_method', self.signature_method),
|
||||
(u'oauth_consumer_key', self.client_key),
|
||||
]
|
||||
if self.resource_owner_key:
|
||||
params.append((u'oauth_token', self.resource_owner_key))
|
||||
if self.callback_uri:
|
||||
params.append((u'oauth_callback', self.callback_uri))
|
||||
if self.verifier:
|
||||
params.append((u'oauth_verifier', self.verifier))
|
||||
|
||||
return params
|
||||
|
||||
def _render(self, request, formencode=False):
|
||||
"""Render a signed request according to signature type
|
||||
|
||||
Returns a 3-tuple containing the request URI, headers, and body.
|
||||
|
||||
If the formencode argument is True and the body contains parameters, it
|
||||
is escaped and returned as a valid formencoded string.
|
||||
"""
|
||||
# TODO what if there are body params on a header-type auth?
|
||||
# TODO what if there are query params on a body-type auth?
|
||||
|
||||
uri, headers, body = request.uri, request.headers, request.body
|
||||
|
||||
# TODO: right now these prepare_* methods are very narrow in scope--they
|
||||
# only affect their little thing. In some cases (for example, with
|
||||
# header auth) it might be advantageous to allow these methods to touch
|
||||
# other parts of the request, like the headers—so the prepare_headers
|
||||
# method could also set the Content-Type header to x-www-form-urlencoded
|
||||
# like the spec requires. This would be a fundamental change though, and
|
||||
# I'm not sure how I feel about it.
|
||||
if self.signature_type == SIGNATURE_TYPE_AUTH_HEADER:
|
||||
headers = parameters.prepare_headers(request.oauth_params, request.headers)
|
||||
elif self.signature_type == SIGNATURE_TYPE_BODY and request.decoded_body is not None:
|
||||
body = parameters.prepare_form_encoded_body(request.oauth_params, request.decoded_body)
|
||||
if formencode:
|
||||
body = urlencode(body)
|
||||
headers['Content-Type'] = u'application/x-www-form-urlencoded'
|
||||
elif self.signature_type == SIGNATURE_TYPE_QUERY:
|
||||
uri = parameters.prepare_request_uri_query(request.oauth_params, request.uri)
|
||||
else:
|
||||
raise ValueError('Unknown signature type specified.')
|
||||
|
||||
return uri, headers, body
|
||||
|
||||
def sign(self, uri, http_method=u'GET', body=None, headers=None):
|
||||
"""Sign a request
|
||||
|
||||
Signs an HTTP request with the specified parts.
|
||||
|
||||
Returns a 3-tuple of the signed request's URI, headers, and body.
|
||||
Note that http_method is not returned as it is unaffected by the OAuth
|
||||
signing process.
|
||||
|
||||
The body argument may be a dict, a list of 2-tuples, or a formencoded
|
||||
string. The Content-Type header must be 'application/x-www-form-urlencoded'
|
||||
if it is present.
|
||||
|
||||
If the body argument is not one of the above, it will be returned
|
||||
verbatim as it is unaffected by the OAuth signing process. Attempting to
|
||||
sign a request with non-formencoded data using the OAuth body signature
|
||||
type is invalid and will raise an exception.
|
||||
|
||||
If the body does contain parameters, it will be returned as a properly-
|
||||
formatted formencoded string.
|
||||
|
||||
All string data MUST be unicode. This includes strings inside body
|
||||
dicts, for example.
|
||||
"""
|
||||
# normalize request data
|
||||
request = Request(uri, http_method, body, headers)
|
||||
|
||||
# sanity check
|
||||
content_type = request.headers.get('Content-Type', None)
|
||||
multipart = content_type and content_type.startswith('multipart/')
|
||||
should_have_params = content_type == CONTENT_TYPE_FORM_URLENCODED
|
||||
has_params = request.decoded_body is not None
|
||||
# 3.4.1.3.1. Parameter Sources
|
||||
# [Parameters are collected from the HTTP request entity-body, but only
|
||||
# if [...]:
|
||||
# * The entity-body is single-part.
|
||||
if multipart and has_params:
|
||||
raise ValueError("Headers indicate a multipart body but body contains parameters.")
|
||||
# * The entity-body follows the encoding requirements of the
|
||||
# "application/x-www-form-urlencoded" content-type as defined by
|
||||
# [W3C.REC-html40-19980424].
|
||||
elif should_have_params and not has_params:
|
||||
raise ValueError("Headers indicate a formencoded body but body was not decodable.")
|
||||
# * The HTTP request entity-header includes the "Content-Type"
|
||||
# header field set to "application/x-www-form-urlencoded".
|
||||
elif not should_have_params and has_params:
|
||||
raise ValueError("Body contains parameters but Content-Type header was not set.")
|
||||
|
||||
# 3.5.2. Form-Encoded Body
|
||||
# Protocol parameters can be transmitted in the HTTP request entity-
|
||||
# body, but only if the following REQUIRED conditions are met:
|
||||
# o The entity-body is single-part.
|
||||
# o The entity-body follows the encoding requirements of the
|
||||
# "application/x-www-form-urlencoded" content-type as defined by
|
||||
# [W3C.REC-html40-19980424].
|
||||
# o The HTTP request entity-header includes the "Content-Type" header
|
||||
# field set to "application/x-www-form-urlencoded".
|
||||
elif self.signature_type == SIGNATURE_TYPE_BODY and not (
|
||||
should_have_params and has_params and not multipart):
|
||||
raise ValueError('Body signatures may only be used with form-urlencoded content')
|
||||
|
||||
# generate the basic OAuth parameters
|
||||
request.oauth_params = self.get_oauth_params()
|
||||
|
||||
# generate the signature
|
||||
request.oauth_params.append((u'oauth_signature', self.get_oauth_signature(request)))
|
||||
|
||||
# render the signed request and return it
|
||||
return self._render(request, formencode=True)
|
||||
|
||||
|
||||
class Server(object):
|
||||
"""A server used to verify OAuth 1.0 RFC 5849 requests"""
|
||||
def __init__(self, signature_method=SIGNATURE_HMAC, rsa_key=None):
|
||||
self.signature_method = signature_method
|
||||
self.rsa_key = rsa_key
|
||||
|
||||
def get_client_secret(self, client_key):
|
||||
raise NotImplementedError("Subclasses must implement this function.")
|
||||
|
||||
def get_resource_owner_secret(self, resource_owner_key):
|
||||
raise NotImplementedError("Subclasses must implement this function.")
|
||||
|
||||
def get_signature_type_and_params(self, uri_query, headers, body):
|
||||
signature_types_with_oauth_params = filter(lambda s: s[1], (
|
||||
(SIGNATURE_TYPE_AUTH_HEADER, utils.filter_oauth_params(
|
||||
signature.collect_parameters(headers=headers,
|
||||
exclude_oauth_signature=False))),
|
||||
(SIGNATURE_TYPE_BODY, utils.filter_oauth_params(
|
||||
signature.collect_parameters(body=body,
|
||||
exclude_oauth_signature=False))),
|
||||
(SIGNATURE_TYPE_QUERY, utils.filter_oauth_params(
|
||||
signature.collect_parameters(uri_query=uri_query,
|
||||
exclude_oauth_signature=False))),
|
||||
))
|
||||
|
||||
if len(signature_types_with_oauth_params) > 1:
|
||||
raise ValueError('oauth_ params must come from only 1 signature type but were found in %s' % ', '.join(
|
||||
[s[0] for s in signature_types_with_oauth_params]))
|
||||
try:
|
||||
signature_type, params = signature_types_with_oauth_params[0]
|
||||
except IndexError:
|
||||
raise ValueError('oauth_ params are missing. Could not determine signature type.')
|
||||
|
||||
return signature_type, dict(params)
|
||||
|
||||
def check_client_key(self, client_key):
|
||||
raise NotImplementedError("Subclasses must implement this function.")
|
||||
|
||||
def check_resource_owner_key(self, client_key, resource_owner_key):
|
||||
raise NotImplementedError("Subclasses must implement this function.")
|
||||
|
||||
def check_timestamp_and_nonce(self, timestamp, nonce):
|
||||
raise NotImplementedError("Subclasses must implement this function.")
|
||||
|
||||
def check_request_signature(self, uri, http_method=u'GET', body='',
|
||||
headers=None):
|
||||
"""Check a request's supplied signature to make sure the request is
|
||||
valid.
|
||||
|
||||
Servers should return HTTP status 400 if a ValueError exception
|
||||
is raised and HTTP status 401 on return value False.
|
||||
|
||||
Per `section 3.2`_ of the spec.
|
||||
|
||||
.. _`section 3.2`: http://tools.ietf.org/html/rfc5849#section-3.2
|
||||
"""
|
||||
headers = headers or {}
|
||||
signature_type = None
|
||||
# FIXME: urlparse does not return unicode!
|
||||
uri_query = urlparse.urlparse(uri).query
|
||||
|
||||
signature_type, params = self.get_signature_type_and_params(uri_query,
|
||||
headers, body)
|
||||
|
||||
# the parameters may not include duplicate oauth entries
|
||||
filtered_params = utils.filter_oauth_params(params)
|
||||
if len(filtered_params) != len(params):
|
||||
raise ValueError("Duplicate OAuth entries.")
|
||||
|
||||
params = dict(params)
|
||||
request_signature = params.get(u'oauth_signature')
|
||||
client_key = params.get(u'oauth_consumer_key')
|
||||
resource_owner_key = params.get(u'oauth_token')
|
||||
nonce = params.get(u'oauth_nonce')
|
||||
timestamp = params.get(u'oauth_timestamp')
|
||||
callback_uri = params.get(u'oauth_callback')
|
||||
verifier = params.get(u'oauth_verifier')
|
||||
signature_method = params.get(u'oauth_signature_method')
|
||||
|
||||
# ensure all mandatory parameters are present
|
||||
if not all((request_signature, client_key, nonce,
|
||||
timestamp, signature_method)):
|
||||
raise ValueError("Missing OAuth parameters.")
|
||||
|
||||
# if version is supplied, it must be "1.0"
|
||||
if u'oauth_version' in params and params[u'oauth_version'] != u'1.0':
|
||||
raise ValueError("Invalid OAuth version.")
|
||||
|
||||
# signature method must be valid
|
||||
if not signature_method in SIGNATURE_METHODS:
|
||||
raise ValueError("Invalid signature method.")
|
||||
|
||||
# ensure client key is valid
|
||||
if not self.check_client_key(client_key):
|
||||
return False
|
||||
|
||||
# ensure resource owner key is valid and not expired
|
||||
if not self.check_resource_owner_key(client_key, resource_owner_key):
|
||||
return False
|
||||
|
||||
# ensure the nonce and timestamp haven't been used before
|
||||
if not self.check_timestamp_and_nonce(timestamp, nonce):
|
||||
return False
|
||||
|
||||
# FIXME: extract realm, then self.check_realm
|
||||
|
||||
# oauth_client parameters depend on client chosen signature method
|
||||
# which may vary for each request, section 3.4
|
||||
# HMAC-SHA1 and PLAINTEXT share parameters
|
||||
if signature_method == SIGNATURE_RSA:
|
||||
oauth_client = Client(client_key,
|
||||
resource_owner_key=resource_owner_key,
|
||||
callback_uri=callback_uri,
|
||||
signature_method=signature_method,
|
||||
signature_type=signature_type,
|
||||
rsa_key=self.rsa_key, verifier=verifier)
|
||||
else:
|
||||
client_secret = self.get_client_secret(client_key)
|
||||
resource_owner_secret = self.get_resource_owner_secret(
|
||||
resource_owner_key)
|
||||
oauth_client = Client(client_key,
|
||||
client_secret=client_secret,
|
||||
resource_owner_key=resource_owner_key,
|
||||
resource_owner_secret=resource_owner_secret,
|
||||
callback_uri=callback_uri,
|
||||
signature_method=signature_method,
|
||||
signature_type=signature_type,
|
||||
verifier=verifier)
|
||||
|
||||
request = Request(uri, http_method, body, headers)
|
||||
request.oauth_params = params
|
||||
|
||||
client_signature = oauth_client.get_oauth_signature(request)
|
||||
|
||||
# FIXME: use near constant time string compare to avoid timing attacks
|
||||
return client_signature == request_signature
|
||||
134
libs/oauthlib/oauth1/rfc5849/parameters.py
Normal file
@@ -0,0 +1,134 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
|
||||
"""
|
||||
oauthlib.parameters
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains methods related to `section 3.5`_ of the OAuth 1.0a spec.
|
||||
|
||||
.. _`section 3.5`: http://tools.ietf.org/html/rfc5849#section-3.5
|
||||
"""
|
||||
|
||||
from urlparse import urlparse, urlunparse
|
||||
from . import utils
|
||||
from oauthlib.common import extract_params, urlencode
|
||||
|
||||
|
||||
# TODO: do we need filter_params now that oauth_params are handled by Request?
|
||||
# We can easily pass in just oauth protocol params.
|
||||
@utils.filter_params
|
||||
def prepare_headers(oauth_params, headers=None, realm=None):
|
||||
"""**Prepare the Authorization header.**
|
||||
Per `section 3.5.1`_ of the spec.
|
||||
|
||||
Protocol parameters can be transmitted using the HTTP "Authorization"
|
||||
header field as defined by `RFC2617`_ with the auth-scheme name set to
|
||||
"OAuth" (case insensitive).
|
||||
|
||||
For example::
|
||||
|
||||
Authorization: OAuth realm="Example",
|
||||
oauth_consumer_key="0685bd9184jfhq22",
|
||||
oauth_token="ad180jjd733klru7",
|
||||
oauth_signature_method="HMAC-SHA1",
|
||||
oauth_signature="wOJIO9A2W5mFwDgiDvZbTSMK%2FPY%3D",
|
||||
oauth_timestamp="137131200",
|
||||
oauth_nonce="4572616e48616d6d65724c61686176",
|
||||
oauth_version="1.0"
|
||||
|
||||
|
||||
.. _`section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1
|
||||
.. _`RFC2617`: http://tools.ietf.org/html/rfc2617
|
||||
"""
|
||||
headers = headers or {}
|
||||
|
||||
# Protocol parameters SHALL be included in the "Authorization" header
|
||||
# field as follows:
|
||||
authorization_header_parameters_parts = []
|
||||
for oauth_parameter_name, value in oauth_params:
|
||||
# 1. Parameter names and values are encoded per Parameter Encoding
|
||||
# (`Section 3.6`_)
|
||||
#
|
||||
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
escaped_name = utils.escape(oauth_parameter_name)
|
||||
escaped_value = utils.escape(value)
|
||||
|
||||
# 2. Each parameter's name is immediately followed by an "=" character
|
||||
# (ASCII code 61), a """ character (ASCII code 34), the parameter
|
||||
# value (MAY be empty), and another """ character (ASCII code 34).
|
||||
part = u'{0}="{1}"'.format(escaped_name, escaped_value)
|
||||
|
||||
authorization_header_parameters_parts.append(part)
|
||||
|
||||
# 3. Parameters are separated by a "," character (ASCII code 44) and
|
||||
# OPTIONAL linear whitespace per `RFC2617`_.
|
||||
#
|
||||
# .. _`RFC2617`: http://tools.ietf.org/html/rfc2617
|
||||
authorization_header_parameters = ', '.join(
|
||||
authorization_header_parameters_parts)
|
||||
|
||||
# 4. The OPTIONAL "realm" parameter MAY be added and interpreted per
|
||||
# `RFC2617 section 1.2`_.
|
||||
#
|
||||
# .. _`RFC2617 section 1.2`: http://tools.ietf.org/html/rfc2617#section-1.2
|
||||
if realm:
|
||||
# NOTE: realm should *not* be escaped
|
||||
authorization_header_parameters = (u'realm="%s", ' % realm +
|
||||
authorization_header_parameters)
|
||||
|
||||
# the auth-scheme name set to "OAuth" (case insensitive).
|
||||
authorization_header = u'OAuth %s' % authorization_header_parameters
|
||||
|
||||
# contribute the Authorization header to the given headers
|
||||
full_headers = {}
|
||||
full_headers.update(headers)
|
||||
full_headers[u'Authorization'] = authorization_header
|
||||
return full_headers
|
||||
|
||||
|
||||
def _append_params(oauth_params, params):
|
||||
"""Append OAuth params to an existing set of parameters.
|
||||
|
||||
Both params and oauth_params must be lists of 2-tuples.
|
||||
|
||||
Per `section 3.5.2`_ and `3.5.3`_ of the spec.
|
||||
|
||||
.. _`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2
|
||||
.. _`3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3
|
||||
|
||||
"""
|
||||
merged = list(params)
|
||||
merged.extend(oauth_params)
|
||||
# The request URI / entity-body MAY include other request-specific
|
||||
# parameters, in which case, the protocol parameters SHOULD be appended
|
||||
# following the request-specific parameters, properly separated by an "&"
|
||||
# character (ASCII code 38)
|
||||
merged.sort(key=lambda i: i[0].startswith('oauth_'))
|
||||
return merged
|
||||
|
||||
|
||||
def prepare_form_encoded_body(oauth_params, body):
|
||||
"""Prepare the Form-Encoded Body.
|
||||
|
||||
Per `section 3.5.2`_ of the spec.
|
||||
|
||||
.. _`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2
|
||||
|
||||
"""
|
||||
# append OAuth params to the existing body
|
||||
return _append_params(oauth_params, body)
|
||||
|
||||
|
||||
def prepare_request_uri_query(oauth_params, uri):
|
||||
"""Prepare the Request URI Query.
|
||||
|
||||
Per `section 3.5.3`_ of the spec.
|
||||
|
||||
.. _`section 3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3
|
||||
|
||||
"""
|
||||
# append OAuth params to the existing set of query components
|
||||
sch, net, path, par, query, fra = urlparse(uri)
|
||||
query = urlencode(_append_params(oauth_params, extract_params(query) or []))
|
||||
return urlunparse((sch, net, path, par, query, fra))
|
||||
501
libs/oauthlib/oauth1/rfc5849/signature.py
Normal file
@@ -0,0 +1,501 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
"""
|
||||
oauthlib.oauth1.rfc5849.signature
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module represents a direct implementation of `section 3.4`_ of the spec.
|
||||
|
||||
Terminology:
|
||||
* Client: software interfacing with an OAuth API
|
||||
* Server: the API provider
|
||||
* Resource Owner: the user who is granting authorization to the client
|
||||
|
||||
Steps for signing a request:
|
||||
|
||||
1. Collect parameters from the uri query, auth header, & body
|
||||
2. Normalize those parameters
|
||||
3. Normalize the uri
|
||||
4. Pass the normalized uri, normalized parameters, and http method to
|
||||
construct the base string
|
||||
5. Pass the base string and any keys needed to a signing function
|
||||
|
||||
.. _`section 3.4`: http://tools.ietf.org/html/rfc5849#section-3.4
|
||||
"""
|
||||
import binascii
|
||||
import hashlib
|
||||
import hmac
|
||||
import urlparse
|
||||
from . import utils
|
||||
from oauthlib.common import extract_params
|
||||
|
||||
|
||||
def construct_base_string(http_method, base_string_uri,
|
||||
normalized_encoded_request_parameters):
|
||||
"""**String Construction**
|
||||
Per `section 3.4.1.1`_ of the spec.
|
||||
|
||||
For example, the HTTP request::
|
||||
|
||||
POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1
|
||||
Host: example.com
|
||||
Content-Type: application/x-www-form-urlencoded
|
||||
Authorization: OAuth realm="Example",
|
||||
oauth_consumer_key="9djdj82h48djs9d2",
|
||||
oauth_token="kkk9d7dh3k39sjv7",
|
||||
oauth_signature_method="HMAC-SHA1",
|
||||
oauth_timestamp="137131201",
|
||||
oauth_nonce="7d8f3e4a",
|
||||
oauth_signature="bYT5CMsGcbgUdFHObYMEfcx6bsw%3D"
|
||||
|
||||
c2&a3=2+q
|
||||
|
||||
is represented by the following signature base string (line breaks
|
||||
are for display purposes only)::
|
||||
|
||||
POST&http%3A%2F%2Fexample.com%2Frequest&a2%3Dr%2520b%26a3%3D2%2520q
|
||||
%26a3%3Da%26b5%3D%253D%25253D%26c%2540%3D%26c2%3D%26oauth_consumer_
|
||||
key%3D9djdj82h48djs9d2%26oauth_nonce%3D7d8f3e4a%26oauth_signature_m
|
||||
ethod%3DHMAC-SHA1%26oauth_timestamp%3D137131201%26oauth_token%3Dkkk
|
||||
9d7dh3k39sjv7
|
||||
|
||||
.. _`section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1
|
||||
"""
|
||||
|
||||
# The signature base string is constructed by concatenating together,
|
||||
# in order, the following HTTP request elements:
|
||||
|
||||
# 1. The HTTP request method in uppercase. For example: "HEAD",
|
||||
# "GET", "POST", etc. If the request uses a custom HTTP method, it
|
||||
# MUST be encoded (`Section 3.6`_).
|
||||
#
|
||||
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
base_string = utils.escape(http_method.upper())
|
||||
|
||||
# 2. An "&" character (ASCII code 38).
|
||||
base_string += u'&'
|
||||
|
||||
# 3. The base string URI from `Section 3.4.1.2`_, after being encoded
|
||||
# (`Section 3.6`_).
|
||||
#
|
||||
# .. _`Section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2
|
||||
# .. _`Section 3.4.6`: http://tools.ietf.org/html/rfc5849#section-3.4.6
|
||||
base_string += utils.escape(base_string_uri)
|
||||
|
||||
# 4. An "&" character (ASCII code 38).
|
||||
base_string += u'&'
|
||||
|
||||
# 5. The request parameters as normalized in `Section 3.4.1.3.2`_, after
|
||||
# being encoded (`Section 3.6`).
|
||||
#
|
||||
# .. _`Section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2
|
||||
# .. _`Section 3.4.6`: http://tools.ietf.org/html/rfc5849#section-3.4.6
|
||||
base_string += utils.escape(normalized_encoded_request_parameters)
|
||||
|
||||
return base_string
|
||||
|
||||
|
||||
def normalize_base_string_uri(uri):
|
||||
"""**Base String URI**
|
||||
Per `section 3.4.1.2`_ of the spec.
|
||||
|
||||
For example, the HTTP request::
|
||||
|
||||
GET /r%20v/X?id=123 HTTP/1.1
|
||||
Host: EXAMPLE.COM:80
|
||||
|
||||
is represented by the base string URI: "http://example.com/r%20v/X".
|
||||
|
||||
In another example, the HTTPS request::
|
||||
|
||||
GET /?q=1 HTTP/1.1
|
||||
Host: www.example.net:8080
|
||||
|
||||
is represented by the base string URI: "https://www.example.net:8080/".
|
||||
|
||||
.. _`section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2
|
||||
"""
|
||||
if not isinstance(uri, unicode):
|
||||
raise ValueError('uri must be a unicode object.')
|
||||
|
||||
# FIXME: urlparse does not support unicode
|
||||
scheme, netloc, path, params, query, fragment = urlparse.urlparse(uri)
|
||||
|
||||
# The scheme, authority, and path of the request resource URI `RFC3986`
|
||||
# are included by constructing an "http" or "https" URI representing
|
||||
# the request resource (without the query or fragment) as follows:
|
||||
#
|
||||
# .. _`RFC3986`: http://tools.ietf.org/html/rfc3986
|
||||
|
||||
# 1. The scheme and host MUST be in lowercase.
|
||||
scheme = scheme.lower()
|
||||
netloc = netloc.lower()
|
||||
|
||||
# 2. The host and port values MUST match the content of the HTTP
|
||||
# request "Host" header field.
|
||||
# TODO: enforce this constraint
|
||||
|
||||
# 3. The port MUST be included if it is not the default port for the
|
||||
# scheme, and MUST be excluded if it is the default. Specifically,
|
||||
# the port MUST be excluded when making an HTTP request `RFC2616`_
|
||||
# to port 80 or when making an HTTPS request `RFC2818`_ to port 443.
|
||||
# All other non-default port numbers MUST be included.
|
||||
#
|
||||
# .. _`RFC2616`: http://tools.ietf.org/html/rfc2616
|
||||
# .. _`RFC2818`: http://tools.ietf.org/html/rfc2818
|
||||
default_ports = (
|
||||
(u'http', u'80'),
|
||||
(u'https', u'443'),
|
||||
)
|
||||
if u':' in netloc:
|
||||
host, port = netloc.split(u':', 1)
|
||||
if (scheme, port) in default_ports:
|
||||
netloc = host
|
||||
|
||||
return urlparse.urlunparse((scheme, netloc, path, u'', u'', u''))
|
||||
|
||||
|
||||
# ** Request Parameters **
|
||||
#
|
||||
# Per `section 3.4.1.3`_ of the spec.
|
||||
#
|
||||
# In order to guarantee a consistent and reproducible representation of
|
||||
# the request parameters, the parameters are collected and decoded to
|
||||
# their original decoded form. They are then sorted and encoded in a
|
||||
# particular manner that is often different from their original
|
||||
# encoding scheme, and concatenated into a single string.
|
||||
#
|
||||
# .. _`section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3
|
||||
|
||||
def collect_parameters(uri_query='', body=[], headers=None,
|
||||
exclude_oauth_signature=True):
|
||||
"""**Parameter Sources**
|
||||
|
||||
Parameters starting with `oauth_` will be unescaped.
|
||||
|
||||
Body parameters must be supplied as a dict, a list of 2-tuples, or a
|
||||
formencoded query string.
|
||||
|
||||
Headers must be supplied as a dict.
|
||||
|
||||
Per `section 3.4.1.3.1`_ of the spec.
|
||||
|
||||
For example, the HTTP request::
|
||||
|
||||
POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1
|
||||
Host: example.com
|
||||
Content-Type: application/x-www-form-urlencoded
|
||||
Authorization: OAuth realm="Example",
|
||||
oauth_consumer_key="9djdj82h48djs9d2",
|
||||
oauth_token="kkk9d7dh3k39sjv7",
|
||||
oauth_signature_method="HMAC-SHA1",
|
||||
oauth_timestamp="137131201",
|
||||
oauth_nonce="7d8f3e4a",
|
||||
oauth_signature="djosJKDKJSD8743243%2Fjdk33klY%3D"
|
||||
|
||||
c2&a3=2+q
|
||||
|
||||
contains the following (fully decoded) parameters used in the
|
||||
signature base string::
|
||||
|
||||
+------------------------+------------------+
|
||||
| Name | Value |
|
||||
+------------------------+------------------+
|
||||
| b5 | =%3D |
|
||||
| a3 | a |
|
||||
| c@ | |
|
||||
| a2 | r b |
|
||||
| oauth_consumer_key | 9djdj82h48djs9d2 |
|
||||
| oauth_token | kkk9d7dh3k39sjv7 |
|
||||
| oauth_signature_method | HMAC-SHA1 |
|
||||
| oauth_timestamp | 137131201 |
|
||||
| oauth_nonce | 7d8f3e4a |
|
||||
| c2 | |
|
||||
| a3 | 2 q |
|
||||
+------------------------+------------------+
|
||||
|
||||
Note that the value of "b5" is "=%3D" and not "==". Both "c@" and
|
||||
"c2" have empty values. While the encoding rules specified in this
|
||||
specification for the purpose of constructing the signature base
|
||||
string exclude the use of a "+" character (ASCII code 43) to
|
||||
represent an encoded space character (ASCII code 32), this practice
|
||||
is widely used in "application/x-www-form-urlencoded" encoded values,
|
||||
and MUST be properly decoded, as demonstrated by one of the "a3"
|
||||
parameter instances (the "a3" parameter is used twice in this
|
||||
request).
|
||||
|
||||
.. _`section 3.4.1.3.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.1
|
||||
"""
|
||||
headers = headers or {}
|
||||
params = []
|
||||
|
||||
# The parameters from the following sources are collected into a single
|
||||
# list of name/value pairs:
|
||||
|
||||
# * The query component of the HTTP request URI as defined by
|
||||
# `RFC3986, Section 3.4`_. The query component is parsed into a list
|
||||
# of name/value pairs by treating it as an
|
||||
# "application/x-www-form-urlencoded" string, separating the names
|
||||
# and values and decoding them as defined by
|
||||
# `W3C.REC-html40-19980424`_, Section 17.13.4.
|
||||
#
|
||||
# .. _`RFC3986, Section 3.4`: http://tools.ietf.org/html/rfc3986#section-3.4
|
||||
# .. _`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424
|
||||
if uri_query:
|
||||
params.extend(urlparse.parse_qsl(uri_query, keep_blank_values=True))
|
||||
|
||||
# * The OAuth HTTP "Authorization" header field (`Section 3.5.1`_) if
|
||||
# present. The header's content is parsed into a list of name/value
|
||||
# pairs excluding the "realm" parameter if present. The parameter
|
||||
# values are decoded as defined by `Section 3.5.1`_.
|
||||
#
|
||||
# .. _`Section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1
|
||||
if headers:
|
||||
headers_lower = dict((k.lower(), v) for k, v in headers.items())
|
||||
authorization_header = headers_lower.get(u'authorization')
|
||||
if authorization_header is not None:
|
||||
params.extend([i for i in utils.parse_authorization_header(
|
||||
authorization_header) if i[0] != u'realm'])
|
||||
|
||||
# * The HTTP request entity-body, but only if all of the following
|
||||
# conditions are met:
|
||||
# * The entity-body is single-part.
|
||||
#
|
||||
# * The entity-body follows the encoding requirements of the
|
||||
# "application/x-www-form-urlencoded" content-type as defined by
|
||||
# `W3C.REC-html40-19980424`_.
|
||||
|
||||
# * The HTTP request entity-header includes the "Content-Type"
|
||||
# header field set to "application/x-www-form-urlencoded".
|
||||
#
|
||||
# .. _`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424
|
||||
|
||||
# TODO: enforce header param inclusion conditions
|
||||
bodyparams = extract_params(body) or []
|
||||
params.extend(bodyparams)
|
||||
|
||||
# ensure all oauth params are unescaped
|
||||
unescaped_params = []
|
||||
for k, v in params:
|
||||
if k.startswith(u'oauth_'):
|
||||
v = utils.unescape(v)
|
||||
unescaped_params.append((k, v))
|
||||
|
||||
# The "oauth_signature" parameter MUST be excluded from the signature
|
||||
# base string if present.
|
||||
if exclude_oauth_signature:
|
||||
unescaped_params = filter(lambda i: i[0] != u'oauth_signature',
|
||||
unescaped_params)
|
||||
|
||||
return unescaped_params
|
||||
|
||||
|
||||
def normalize_parameters(params):
|
||||
"""**Parameters Normalization**
|
||||
Per `section 3.4.1.3.2`_ of the spec.
|
||||
|
||||
For example, the list of parameters from the previous section would
|
||||
be normalized as follows:
|
||||
|
||||
Encoded::
|
||||
|
||||
+------------------------+------------------+
|
||||
| Name | Value |
|
||||
+------------------------+------------------+
|
||||
| b5 | %3D%253D |
|
||||
| a3 | a |
|
||||
| c%40 | |
|
||||
| a2 | r%20b |
|
||||
| oauth_consumer_key | 9djdj82h48djs9d2 |
|
||||
| oauth_token | kkk9d7dh3k39sjv7 |
|
||||
| oauth_signature_method | HMAC-SHA1 |
|
||||
| oauth_timestamp | 137131201 |
|
||||
| oauth_nonce | 7d8f3e4a |
|
||||
| c2 | |
|
||||
| a3 | 2%20q |
|
||||
+------------------------+------------------+
|
||||
|
||||
Sorted::
|
||||
|
||||
+------------------------+------------------+
|
||||
| Name | Value |
|
||||
+------------------------+------------------+
|
||||
| a2 | r%20b |
|
||||
| a3 | 2%20q |
|
||||
| a3 | a |
|
||||
| b5 | %3D%253D |
|
||||
| c%40 | |
|
||||
| c2 | |
|
||||
| oauth_consumer_key | 9djdj82h48djs9d2 |
|
||||
| oauth_nonce | 7d8f3e4a |
|
||||
| oauth_signature_method | HMAC-SHA1 |
|
||||
| oauth_timestamp | 137131201 |
|
||||
| oauth_token | kkk9d7dh3k39sjv7 |
|
||||
+------------------------+------------------+
|
||||
|
||||
Concatenated Pairs::
|
||||
|
||||
+-------------------------------------+
|
||||
| Name=Value |
|
||||
+-------------------------------------+
|
||||
| a2=r%20b |
|
||||
| a3=2%20q |
|
||||
| a3=a |
|
||||
| b5=%3D%253D |
|
||||
| c%40= |
|
||||
| c2= |
|
||||
| oauth_consumer_key=9djdj82h48djs9d2 |
|
||||
| oauth_nonce=7d8f3e4a |
|
||||
| oauth_signature_method=HMAC-SHA1 |
|
||||
| oauth_timestamp=137131201 |
|
||||
| oauth_token=kkk9d7dh3k39sjv7 |
|
||||
+-------------------------------------+
|
||||
|
||||
and concatenated together into a single string (line breaks are for
|
||||
display purposes only)::
|
||||
|
||||
a2=r%20b&a3=2%20q&a3=a&b5=%3D%253D&c%40=&c2=&oauth_consumer_key=9dj
|
||||
dj82h48djs9d2&oauth_nonce=7d8f3e4a&oauth_signature_method=HMAC-SHA1
|
||||
&oauth_timestamp=137131201&oauth_token=kkk9d7dh3k39sjv7
|
||||
|
||||
.. _`section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2
|
||||
"""
|
||||
|
||||
# The parameters collected in `Section 3.4.1.3`_ are normalized into a
|
||||
# single string as follows:
|
||||
#
|
||||
# .. _`Section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3
|
||||
|
||||
# 1. First, the name and value of each parameter are encoded
|
||||
# (`Section 3.6`_).
|
||||
#
|
||||
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
key_values = [(utils.escape(k), utils.escape(v)) for k, v in params]
|
||||
|
||||
# 2. The parameters are sorted by name, using ascending byte value
|
||||
# ordering. If two or more parameters share the same name, they
|
||||
# are sorted by their value.
|
||||
key_values.sort()
|
||||
|
||||
# 3. The name of each parameter is concatenated to its corresponding
|
||||
# value using an "=" character (ASCII code 61) as a separator, even
|
||||
# if the value is empty.
|
||||
parameter_parts = [u'{0}={1}'.format(k, v) for k, v in key_values]
|
||||
|
||||
# 4. The sorted name/value pairs are concatenated together into a
|
||||
# single string by using an "&" character (ASCII code 38) as
|
||||
# separator.
|
||||
return u'&'.join(parameter_parts)
|
||||
|
||||
|
||||
def sign_hmac_sha1(base_string, client_secret, resource_owner_secret):
|
||||
"""**HMAC-SHA1**
|
||||
|
||||
The "HMAC-SHA1" signature method uses the HMAC-SHA1 signature
|
||||
algorithm as defined in `RFC2104`_::
|
||||
|
||||
digest = HMAC-SHA1 (key, text)
|
||||
|
||||
Per `section 3.4.2`_ of the spec.
|
||||
|
||||
.. _`RFC2104`: http://tools.ietf.org/html/rfc2104
|
||||
.. _`section 3.4.2`: http://tools.ietf.org/html/rfc5849#section-3.4.2
|
||||
"""
|
||||
|
||||
# The HMAC-SHA1 function variables are used in following way:
|
||||
|
||||
# text is set to the value of the signature base string from
|
||||
# `Section 3.4.1.1`_.
|
||||
#
|
||||
# .. _`Section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1
|
||||
text = base_string
|
||||
|
||||
# key is set to the concatenated values of:
|
||||
# 1. The client shared-secret, after being encoded (`Section 3.6`_).
|
||||
#
|
||||
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
key = utils.escape(client_secret or u'')
|
||||
|
||||
# 2. An "&" character (ASCII code 38), which MUST be included
|
||||
# even when either secret is empty.
|
||||
key += u'&'
|
||||
|
||||
# 3. The token shared-secret, after being encoded (`Section 3.6`_).
|
||||
#
|
||||
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
key += utils.escape(resource_owner_secret or u'')
|
||||
|
||||
# FIXME: HMAC does not support unicode!
|
||||
key_utf8 = key.encode('utf-8')
|
||||
text_utf8 = text.encode('utf-8')
|
||||
signature = hmac.new(key_utf8, text_utf8, hashlib.sha1)
|
||||
|
||||
# digest is used to set the value of the "oauth_signature" protocol
|
||||
# parameter, after the result octet string is base64-encoded
|
||||
# per `RFC2045, Section 6.8`.
|
||||
#
|
||||
# .. _`RFC2045, Section 6.8`: http://tools.ietf.org/html/rfc2045#section-6.8
|
||||
return binascii.b2a_base64(signature.digest())[:-1].decode('utf-8')
|
||||
|
||||
|
||||
def sign_rsa_sha1(base_string, rsa_private_key):
|
||||
"""**RSA-SHA1**
|
||||
|
||||
Per `section 3.4.3`_ of the spec.
|
||||
|
||||
The "RSA-SHA1" signature method uses the RSASSA-PKCS1-v1_5 signature
|
||||
algorithm as defined in `RFC3447, Section 8.2`_ (also known as
|
||||
PKCS#1), using SHA-1 as the hash function for EMSA-PKCS1-v1_5. To
|
||||
use this method, the client MUST have established client credentials
|
||||
with the server that included its RSA public key (in a manner that is
|
||||
beyond the scope of this specification).
|
||||
|
||||
NOTE: this method requires the python-rsa library.
|
||||
|
||||
.. _`section 3.4.3`: http://tools.ietf.org/html/rfc5849#section-3.4.3
|
||||
.. _`RFC3447, Section 8.2`: http://tools.ietf.org/html/rfc3447#section-8.2
|
||||
|
||||
"""
|
||||
|
||||
# TODO: finish RSA documentation
|
||||
|
||||
import rsa
|
||||
key = rsa.PrivateKey.load_pkcs1(rsa_private_key)
|
||||
sig = rsa.sign(base_string, key, 'SHA-1')
|
||||
return binascii.b2a_base64(sig)[:-1]
|
||||
|
||||
|
||||
def sign_plaintext(client_secret, resource_owner_secret):
|
||||
"""Sign a request using plaintext.
|
||||
|
||||
Per `section 3.4.4`_ of the spec.
|
||||
|
||||
The "PLAINTEXT" method does not employ a signature algorithm. It
|
||||
MUST be used with a transport-layer mechanism such as TLS or SSL (or
|
||||
sent over a secure channel with equivalent protections). It does not
|
||||
utilize the signature base string or the "oauth_timestamp" and
|
||||
"oauth_nonce" parameters.
|
||||
|
||||
.. _`section 3.4.4`: http://tools.ietf.org/html/rfc5849#section-3.4.4
|
||||
|
||||
"""
|
||||
|
||||
# The "oauth_signature" protocol parameter is set to the concatenated
|
||||
# value of:
|
||||
|
||||
# 1. The client shared-secret, after being encoded (`Section 3.6`_).
|
||||
#
|
||||
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
signature = utils.escape(client_secret or u'')
|
||||
|
||||
# 2. An "&" character (ASCII code 38), which MUST be included even
|
||||
# when either secret is empty.
|
||||
signature += u'&'
|
||||
|
||||
# 3. The token shared-secret, after being encoded (`Section 3.6`_).
|
||||
#
|
||||
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
signature += utils.escape(resource_owner_secret or u'')
|
||||
|
||||
return signature
|
||||
|
||||
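For reference, the helpers in this new module are composed in the order the module docstring describes: collect, normalize, build the base string, then sign. A minimal sketch (not part of this commit; it assumes `libs` is on sys.path so the package imports as `oauthlib`, and the secrets are placeholders):

# Illustrative sketch only, not part of the diff.
from oauthlib.oauth1.rfc5849 import signature

# 1-2. Collect and normalize parameters from the query and body.
params = signature.collect_parameters(
    uri_query=u'b5=%3D%253D&a3=a&c%40=&a2=r%20b',
    body=u'c2&a3=2+q')
normalized = signature.normalize_parameters(params)

# 3-4. Normalize the URI and build the signature base string.
base_uri = signature.normalize_base_string_uri(u'http://EXAMPLE.COM:80/request')
base_string = signature.construct_base_string(u'POST', base_uri, normalized)

# 5. Sign with HMAC-SHA1 (placeholder client and token secrets).
oauth_signature = signature.sign_hmac_sha1(base_string, u'client_secret', u'token_secret')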
141
libs/oauthlib/oauth1/rfc5849/utils.py
Normal file
@@ -0,0 +1,141 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
oauthlib.utils
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
This module contains utility methods used by various parts of the OAuth
|
||||
spec.
|
||||
"""
|
||||
|
||||
import string
|
||||
import time
|
||||
import urllib2
|
||||
from random import getrandbits, choice
|
||||
|
||||
from oauthlib.common import quote, unquote
|
||||
|
||||
UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') +
|
||||
string.digits.decode('ascii'))
|
||||
|
||||
|
||||
def filter_params(target):
|
||||
"""Decorator which filters params to remove non-oauth_* parameters
|
||||
|
||||
Assumes the decorated method takes a params dict or list of tuples as its
|
||||
first argument.
|
||||
"""
|
||||
def wrapper(params, *args, **kwargs):
|
||||
params = filter_oauth_params(params)
|
||||
return target(params, *args, **kwargs)
|
||||
|
||||
wrapper.__doc__ = target.__doc__
|
||||
return wrapper
|
||||
|
||||
|
||||
def filter_oauth_params(params):
|
||||
"""Removes all non oauth parameters from a dict or a list of params."""
|
||||
is_oauth = lambda kv: kv[0].startswith(u"oauth_")
|
||||
if isinstance(params, dict):
|
||||
return filter(is_oauth, params.items())
|
||||
else:
|
||||
return filter(is_oauth, params)
|
||||
|
||||
|
||||
def generate_timestamp():
|
||||
"""Get seconds since epoch (UTC).
|
||||
|
||||
Per `section 3.3`_ of the spec.
|
||||
|
||||
.. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3
|
||||
"""
|
||||
return unicode(int(time.time()))
|
||||
|
||||
|
||||
def generate_nonce():
|
||||
"""Generate pseudorandom nonce that is unlikely to repeat.
|
||||
|
||||
Per `section 3.3`_ of the spec.
|
||||
|
||||
A random 64-bit number is appended to the epoch timestamp for both
|
||||
randomness and to decrease the likelihood of collisions.
|
||||
|
||||
.. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3
|
||||
"""
|
||||
return unicode(getrandbits(64)) + generate_timestamp()
|
||||
|
||||
|
||||
def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET):
|
||||
"""Generates a generic OAuth token
|
||||
|
||||
According to `section 2`_ of the spec, the method of token
|
||||
construction is undefined. This implementation is simply a random selection
|
||||
of `length` choices from `chars`.
|
||||
|
||||
Credit to Ignacio Vazquez-Abrams for his excellent `Stackoverflow answer`_
|
||||
|
||||
.. _`Stackoverflow answer`: http://stackoverflow.com/questions/2257441/
|
||||
python-random-string-generation-with-upper-case-letters-and-digits
|
||||
|
||||
"""
|
||||
return u''.join(choice(chars) for x in range(length))
|
||||
|
||||
|
||||
def escape(u):
|
||||
"""Escape a unicode string in an OAuth-compatible fashion.
|
||||
|
||||
Per `section 3.6`_ of the spec.
|
||||
|
||||
.. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
|
||||
"""
|
||||
if not isinstance(u, unicode):
|
||||
raise ValueError('Only unicode objects are escapable.')
|
||||
# Letters, digits, and the characters '_.-' are already treated as safe
|
||||
# by urllib.quote(). We need to add '~' to fully support rfc5849.
|
||||
return quote(u, safe='~')
|
||||
|
||||
|
||||
def unescape(u):
|
||||
if not isinstance(u, unicode):
|
||||
raise ValueError('Only unicode objects are unescapable.')
|
||||
return unquote(u)
|
||||
|
||||
|
||||
def urlencode(query):
|
||||
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
||||
|
||||
Operates using an OAuth-safe escape() method, in contrast to urllib.urlencode.
|
||||
"""
|
||||
# Convert dictionaries to list of tuples
|
||||
if isinstance(query, dict):
|
||||
query = query.items()
|
||||
return u"&".join([u'='.join([escape(k), escape(v)]) for k, v in query])
|
||||
|
||||
|
||||
def parse_keqv_list(l):
|
||||
"""A unicode-safe version of urllib2.parse_keqv_list"""
|
||||
encoded_list = [u.encode('utf-8') for u in l]
|
||||
encoded_parsed = urllib2.parse_keqv_list(encoded_list)
|
||||
return dict((k.decode('utf-8'),
|
||||
v.decode('utf-8')) for k,v in encoded_parsed.items())
|
||||
|
||||
|
||||
def parse_http_list(u):
|
||||
"""A unicode-safe version of urllib2.parse_http_list"""
|
||||
encoded_str = u.encode('utf-8')
|
||||
encoded_list = urllib2.parse_http_list(encoded_str)
|
||||
return [s.decode('utf-8') for s in encoded_list]
|
||||
|
||||
|
||||
def parse_authorization_header(authorization_header):
|
||||
"""Parse an OAuth authorization header into a list of 2-tuples"""
|
||||
auth_scheme = u'OAuth '
|
||||
if authorization_header.startswith(auth_scheme):
|
||||
authorization_header = authorization_header.replace(auth_scheme, u'', 1)
|
||||
items = parse_http_list(authorization_header)
|
||||
try:
|
||||
return parse_keqv_list(items).items()
|
||||
except ValueError:
|
||||
raise ValueError('Malformed authorization header')
|
||||
|
||||
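A short sketch of how the helpers above behave (illustrative only, not part of the diff; random outputs are examples, and values follow the docstrings):

# Illustrative sketch only, not part of the diff.
from oauthlib.oauth1.rfc5849 import utils

print utils.escape(u'r b')        # u'r%20b' -- '~' stays unescaped per RFC 5849
print utils.generate_nonce()      # e.g. u'12693...137131201' (64 random bits + timestamp)
print utils.generate_token(10)    # e.g. u'aBc3Xy9QwZ'

header = u'OAuth realm="Example", oauth_consumer_key="9djdj82h48djs9d2", oauth_nonce="7d8f3e4a"'
print utils.parse_authorization_header(header)
# -> list of (name, value) 2-tuples, including u'realm'; ordering is not guaranteed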
13
libs/oauthlib/oauth2/__init__.py
Normal file
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

"""
oauthlib.oauth2
~~~~~~~~~~~~~~~

This module is a wrapper for the most recent implementation of OAuth 2.0 Client
and Server classes.
"""

from .draft25 import Client, Server
14
libs/oauthlib/oauth2/draft25/__init__.py
Normal file
@@ -0,0 +1,14 @@
"""
oauthlib.oauth2.draft_25
~~~~~~~~~~~~~~~~~~~~~~~~

This module is an implementation of various logic needed
for signing and checking OAuth 2.0 draft 25 requests.
"""

class Client(object):
    pass

class Server(object):
    pass
131
libs/oauthlib/oauth2/draft25/tokens.py
Normal file
@@ -0,0 +1,131 @@
|
||||
from __future__ import absolute_import
|
||||
"""
|
||||
oauthlib.oauth2.draft25.tokens
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains methods for adding two types of access tokens to requests.
|
||||
|
||||
- Bearer http://tools.ietf.org/html/draft-ietf-oauth-saml2-bearer-08
|
||||
- MAC http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-00
|
||||
|
||||
"""
|
||||
from binascii import b2a_base64
|
||||
import hashlib
|
||||
import hmac
|
||||
from urlparse import urlparse
|
||||
|
||||
from . import utils
|
||||
|
||||
|
||||
def prepare_mac_header(token, uri, key, http_method, nonce=None, headers=None,
|
||||
body=None, ext=u'', hash_algorithm=u'hmac-sha-1'):
|
||||
"""Add an `MAC Access Authentication`_ signature to headers.
|
||||
|
||||
Unlike OAuth 1, this HMAC signature does not require inclusion of the request
|
||||
payload/body, nor does it use a combination of client_secret and
|
||||
token_secret but rather a mac_key provided together with the access token.
|
||||
|
||||
Currently two algorithms are supported, "hmac-sha-1" and "hmac-sha-256",
|
||||
`extension algorithms`_ are not supported.
|
||||
|
||||
Example MAC Authorization header, linebreaks added for clarity
|
||||
|
||||
Authorization: MAC id="h480djs93hd8",
|
||||
nonce="1336363200:dj83hs9s",
|
||||
mac="bhCQXTVyfj5cmA9uKkPFx1zeOXM="
|
||||
|
||||
.. _`MAC Access Authentication`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01
|
||||
.. _`extension algorithms`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-7.1
|
||||
|
||||
:param uri: Request URI.
|
||||
:param headers: Request headers as a dictionary.
|
||||
:param http_method: HTTP Request method.
|
||||
:param key: MAC key provided by the token endpoint.
|
||||
:param hash_algorithm: HMAC algorithm provided by the token endpoint.
|
||||
:return: headers dictionary with the authorization field added.
|
||||
"""
|
||||
http_method = http_method.upper()
|
||||
host, port = utils.host_from_uri(uri)
|
||||
|
||||
if hash_algorithm.lower() == u'hmac-sha-1':
|
||||
h = hashlib.sha1
|
||||
else:
|
||||
h = hashlib.sha256
|
||||
|
||||
nonce = nonce or u'{0}:{1}'.format(utils.generate_nonce(), utils.generate_timestamp())
|
||||
sch, net, path, par, query, fra = urlparse(uri)
|
||||
|
||||
if query:
|
||||
request_uri = path + u'?' + query
|
||||
else:
|
||||
request_uri = path
|
||||
|
||||
# Hash the body/payload
|
||||
if body is not None:
|
||||
bodyhash = b2a_base64(h(body).digest())[:-1].decode('utf-8')
|
||||
else:
|
||||
bodyhash = u''
|
||||
|
||||
# Create the normalized base string
|
||||
base = []
|
||||
base.append(nonce)
|
||||
base.append(http_method.upper())
|
||||
base.append(request_uri)
|
||||
base.append(host)
|
||||
base.append(port)
|
||||
base.append(bodyhash)
|
||||
base.append(ext)
|
||||
base_string = '\n'.join(base) + u'\n'
|
||||
|
||||
# hmac struggles with unicode strings - http://bugs.python.org/issue5285
|
||||
if isinstance(key, unicode):
|
||||
key = key.encode('utf-8')
|
||||
sign = hmac.new(key, base_string, h)
|
||||
sign = b2a_base64(sign.digest())[:-1].decode('utf-8')
|
||||
|
||||
header = []
|
||||
header.append(u'MAC id="%s"' % token)
|
||||
header.append(u'nonce="%s"' % nonce)
|
||||
if bodyhash:
|
||||
header.append(u'bodyhash="%s"' % bodyhash)
|
||||
if ext:
|
||||
header.append(u'ext="%s"' % ext)
|
||||
header.append(u'mac="%s"' % sign)
|
||||
|
||||
headers = headers or {}
|
||||
headers[u'Authorization'] = u', '.join(header)
|
||||
return headers
|
||||
|
||||
|
||||
def prepare_bearer_uri(token, uri):
|
||||
"""Add a `Bearer Token`_ to the request URI.
|
||||
Not recommended, use only if client can't use authorization header or body.
|
||||
|
||||
http://www.example.com/path?access_token=h480djs93hd8
|
||||
|
||||
.. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
|
||||
"""
|
||||
return utils.add_params_to_uri(uri, [(u'access_token', token)])
|
||||
|
||||
|
||||
def prepare_bearer_headers(token, headers=None):
|
||||
"""Add a `Bearer Token`_ to the request URI.
|
||||
Recommended method of passing bearer tokens.
|
||||
|
||||
Authorization: Bearer h480djs93hd8
|
||||
|
||||
.. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
|
||||
"""
|
||||
headers = headers or {}
|
||||
headers[u'Authorization'] = u'Bearer %s' % token
|
||||
return headers
|
||||
|
||||
|
||||
def prepare_bearer_body(token, body=u''):
|
||||
"""Add a `Bearer Token`_ to the request body.
|
||||
|
||||
access_token=h480djs93hd8
|
||||
|
||||
.. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
|
||||
"""
|
||||
return utils.add_params_to_qs(body, [(u'access_token', token)])
|
||||
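The bearer helpers and the MAC helper above can be exercised as follows (illustrative only, not part of the diff; the token and URIs are the placeholder values from the docstrings, the MAC key is made up):

# Illustrative sketch only, not part of the diff.
from oauthlib.oauth2.draft25 import tokens

print tokens.prepare_bearer_headers(u'h480djs93hd8')
# {u'Authorization': u'Bearer h480djs93hd8'}
print tokens.prepare_bearer_uri(u'h480djs93hd8', u'http://www.example.com/path')
# u'http://www.example.com/path?access_token=h480djs93hd8'
print tokens.prepare_bearer_body(u'h480djs93hd8')
# u'access_token=h480djs93hd8'

headers = tokens.prepare_mac_header(u'h480djs93hd8', u'http://example.com/resource/1?b=1',
                                    u'secret-mac-key', u'GET', hash_algorithm=u'hmac-sha-1')
# headers[u'Authorization'] -> u'MAC id="h480djs93hd8", nonce="...", mac="..."'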
128
libs/oauthlib/oauth2/draft25/utils.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""
|
||||
oauthlib.utils
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
This module contains utility methods used by various parts of the OAuth 2 spec.
|
||||
"""
|
||||
|
||||
import random
|
||||
import string
|
||||
import time
|
||||
import urllib
|
||||
from urlparse import urlparse, urlunparse, parse_qsl
|
||||
|
||||
UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') +
|
||||
string.digits.decode('ascii'))
|
||||
|
||||
def add_params_to_qs(query, params):
|
||||
"""Extend a query with a list of two-tuples.
|
||||
|
||||
:param query: Query string.
|
||||
:param params: List of two-tuples.
|
||||
:return: extended query
|
||||
"""
|
||||
queryparams = parse_qsl(query, keep_blank_values=True)
|
||||
queryparams.extend(params)
|
||||
return urlencode(queryparams)
|
||||
|
||||
|
||||
def add_params_to_uri(uri, params):
|
||||
"""Add a list of two-tuples to the uri query components.
|
||||
|
||||
:param uri: Full URI.
|
||||
:param params: List of two-tuples.
|
||||
:return: uri with extended query
|
||||
"""
|
||||
sch, net, path, par, query, fra = urlparse(uri)
|
||||
query = add_params_to_qs(query, params)
|
||||
return urlunparse((sch, net, path, par, query, fra))
|
||||
|
||||
|
||||
def escape(u):
|
||||
"""Escape a string in an OAuth-compatible fashion.
|
||||
|
||||
Per `section 3.6`_ of the spec.
|
||||
|
||||
.. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
|
||||
|
||||
"""
|
||||
if not isinstance(u, unicode):
|
||||
raise ValueError('Only unicode objects are escapable.')
|
||||
return urllib.quote(u.encode('utf-8'), safe='~')
|
||||
|
||||
|
||||
def generate_nonce():
|
||||
"""Generate pseudorandom nonce that is unlikely to repeat.
|
||||
|
||||
Per `section 3.2.1`_ of the MAC Access Authentication spec.
|
||||
|
||||
A random 64-bit number is appended to the epoch timestamp for both
|
||||
randomness and to decrease the likelihood of collisions.
|
||||
|
||||
.. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1
|
||||
"""
|
||||
return unicode(unicode(random.getrandbits(64)) + generate_timestamp())
|
||||
|
||||
|
||||
def generate_timestamp():
|
||||
"""Get seconds since epoch (UTC).
|
||||
|
||||
Per `section 3.2.1`_ of the MAC Access Authentication spec.
|
||||
|
||||
.. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1
|
||||
"""
|
||||
return unicode(int(time.time()))
|
||||
|
||||
|
||||
def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET):
|
||||
"""Generates a generic OAuth 2 token
|
||||
|
||||
According to `section 1.4`_ and `section 1.5`_ of the spec, the method of token
|
||||
construction is undefined. This implementation is simply a random selection
|
||||
of `length` choices from `chars`. SystemRandom is used since it provides
|
||||
higher entropy than random.choice.
|
||||
|
||||
.. _`section 1.4`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.4
|
||||
.. _`section 1.5`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.5
|
||||
"""
|
||||
rand = random.SystemRandom()
|
||||
return u''.join(rand.choice(chars) for x in range(length))
|
||||
|
||||
|
||||
def host_from_uri(uri):
|
||||
"""Extract hostname and port from URI.
|
||||
|
||||
Will use default port for HTTP and HTTPS if none is present in the URI.
|
||||
|
||||
>>> host_from_uri(u'https://www.example.com/path?query')
|
||||
u'www.example.com', u'443'
|
||||
>>> host_from_uri(u'http://www.example.com:8080/path?query')
|
||||
u'www.example.com', u'8080'
|
||||
|
||||
:param uri: Full URI.
|
||||
|
||||
:return: hostname, port
|
||||
"""
|
||||
default_ports = {
|
||||
u'HTTP' : u'80',
|
||||
u'HTTPS' : u'443',
|
||||
}
|
||||
|
||||
sch, netloc, path, par, query, fra = urlparse(uri)
|
||||
if u':' in netloc:
|
||||
netloc, port = netloc.split(u':', 1)
|
||||
else:
|
||||
port = default_ports.get(sch.upper())
|
||||
|
||||
return netloc, port
|
||||
|
||||
|
||||
def urlencode(query):
|
||||
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
||||
|
||||
Operates using an OAuth-safe escape() method, in contrast to urllib.urlencode.
|
||||
"""
|
||||
# Convert dictionaries to list of tuples
|
||||
if isinstance(query, dict):
|
||||
query = query.items()
|
||||
return "&".join(['='.join([escape(k), escape(v)]) for k, v in query])
|
||||
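The URI helpers above behave as in their docstrings; a couple of quick examples (illustrative only, not part of the diff; random output is an example):

# Illustrative sketch only, not part of the diff.
from oauthlib.oauth2.draft25 import utils

print utils.host_from_uri(u'http://www.example.com:8080/path?query')
# (u'www.example.com', u'8080')
print utils.add_params_to_uri(u'http://example.com/path', [(u'state', u'xyz')])
# u'http://example.com/path?state=xyz'
print utils.generate_token(30)
# e.g. u'fmRO3BVBW9w45dV9mOTcVemMnOJsTv'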
@@ -18,7 +18,8 @@
|
||||
from .core import (SERVICES, LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE,
|
||||
MATCHING_CONFIDENCE, create_list_tasks, consume_task, create_download_tasks,
|
||||
group_by_video, key_subtitles)
|
||||
from .languages import list_languages
|
||||
import guessit
|
||||
from guessit.language import ALL_LANGUAGES
|
||||
import logging
|
||||
|
||||
|
||||
@@ -26,7 +27,7 @@ __all__ = ['list_subtitles', 'download_subtitles']
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def list_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3):
|
||||
def list_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None):
|
||||
"""List subtitles in given paths according to the criteria
|
||||
|
||||
:param paths: path(s) to video file or folder
|
||||
@@ -37,19 +38,20 @@ def list_subtitles(paths, languages=None, services=None, force=True, multi=False
|
||||
:param bool multi: search multiple languages for the same video
|
||||
:param string cache_dir: path to the cache directory to use
|
||||
:param int max_depth: maximum depth for scanning entries
|
||||
:param function scan_filter: filter function that takes a path as argument and returns a boolean indicating whether it has to be filtered out (``True``) or not (``False``)
|
||||
:return: found subtitles
|
||||
:rtype: dict of :class:`~subliminal.videos.Video` => [:class:`~subliminal.subtitles.ResultSubtitle`]
|
||||
|
||||
"""
|
||||
services = services or SERVICES
|
||||
languages = set(languages or list_languages(1))
|
||||
languages = set(map(guessit.Language, languages or []) or ALL_LANGUAGES)
|
||||
if isinstance(paths, basestring):
|
||||
paths = [paths]
|
||||
if any([not isinstance(p, unicode) for p in paths]):
|
||||
logger.warning(u'Not all entries are unicode')
|
||||
results = []
|
||||
service_instances = {}
|
||||
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth)
|
||||
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth, scan_filter)
|
||||
for task in tasks:
|
||||
try:
|
||||
result = consume_task(task, service_instances)
|
||||
@@ -61,7 +63,7 @@ def list_subtitles(paths, languages=None, services=None, force=True, multi=False
|
||||
return group_by_video(results)
|
||||
|
||||
|
||||
def download_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, order=None):
|
||||
def download_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None, order=None):
|
||||
"""Download subtitles in given paths according to the criteria
|
||||
|
||||
:param paths: path(s) to video file or folder
|
||||
@@ -72,6 +74,7 @@ def download_subtitles(paths, languages=None, services=None, force=True, multi=F
|
||||
:param bool multi: search multiple languages for the same video
|
||||
:param string cache_dir: path to the cache directory to use
|
||||
:param int max_depth: maximum depth for scanning entries
|
||||
:param function scan_filter: filter function that takes a path as argument and returns a boolean indicating whether it has to be filtered out (``True``) or not (``False``)
|
||||
:param order: preferred order for subtitles sorting
|
||||
:type list: list of :data:`~subliminal.core.LANGUAGE_INDEX`, :data:`~subliminal.core.SERVICE_INDEX`, :data:`~subliminal.core.SERVICE_CONFIDENCE`, :data:`~subliminal.core.MATCHING_CONFIDENCE`
|
||||
:return: found subtitles
|
||||
@@ -79,11 +82,11 @@ def download_subtitles(paths, languages=None, services=None, force=True, multi=F
|
||||
|
||||
"""
|
||||
services = services or SERVICES
|
||||
languages = languages or list_languages(1)
|
||||
languages = map(guessit.Language, languages or []) or list(ALL_LANGUAGES)
|
||||
if isinstance(paths, basestring):
|
||||
paths = [paths]
|
||||
order = order or [LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE]
|
||||
subtitles_by_video = list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth)
|
||||
subtitles_by_video = list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth, scan_filter)
|
||||
for video, subtitles in subtitles_by_video.iteritems():
|
||||
subtitles.sort(key=lambda s: key_subtitles(s, video, languages, services, order), reverse=True)
|
||||
results = []
|
||||
|
||||
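The new scan_filter argument threaded through list_subtitles and download_subtitles above is a callable that receives a path and returns ``True`` to skip that entry. A possible filter, purely as an illustration (the helper name and the commented call are hypothetical, not part of the commit):

# Illustrative sketch only, not part of the diff.
import os

def skip_sample_files(path):
    """Return True to filter the entry out, False to keep scanning it."""
    return 'sample' in os.path.basename(path).lower()

# e.g. subliminal.list_subtitles([u'/media/movies'], languages=[u'en'],
#                                scan_filter=skip_sample_files)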
@@ -18,7 +18,7 @@
|
||||
from .core import (consume_task, LANGUAGE_INDEX, SERVICE_INDEX,
|
||||
SERVICE_CONFIDENCE, MATCHING_CONFIDENCE, SERVICES, create_list_tasks,
|
||||
create_download_tasks, group_by_video, key_subtitles)
|
||||
from .languages import list_languages
|
||||
from guessit.language import ALL_LANGUAGES
|
||||
from .tasks import StopTask
|
||||
import Queue
|
||||
import logging
|
||||
@@ -108,29 +108,29 @@ class Pool(object):
|
||||
break
|
||||
return results
|
||||
|
||||
def list_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3):
|
||||
def list_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None):
|
||||
"""See :meth:`subliminal.list_subtitles`"""
|
||||
services = services or SERVICES
|
||||
languages = set(languages or list_languages(1))
|
||||
languages = set(languages or ALL_LANGUAGES)
|
||||
if isinstance(paths, basestring):
|
||||
paths = [paths]
|
||||
if any([not isinstance(p, unicode) for p in paths]):
|
||||
logger.warning(u'Not all entries are unicode')
|
||||
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth)
|
||||
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth, scan_filter)
|
||||
for task in tasks:
|
||||
self.tasks.put(task)
|
||||
self.join()
|
||||
results = self.collect()
|
||||
return group_by_video(results)
|
||||
|
||||
def download_subtitles(self, paths, languages=None, services=None, cache_dir=None, max_depth=3, force=True, multi=False, order=None):
|
||||
def download_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None, order=None):
|
||||
"""See :meth:`subliminal.download_subtitles`"""
|
||||
services = services or SERVICES
|
||||
languages = languages or list_languages(1)
|
||||
languages = languages or list(ALL_LANGUAGES)
|
||||
if isinstance(paths, basestring):
|
||||
paths = [paths]
|
||||
order = order or [LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE]
|
||||
subtitles_by_video = self.list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth)
|
||||
subtitles_by_video = self.list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth, scan_filter)
|
||||
for video, subtitles in subtitles_by_video.iteritems():
|
||||
subtitles.sort(key=lambda s: key_subtitles(s, video, languages, services, order), reverse=True)
|
||||
tasks = create_download_tasks(subtitles_by_video, multi)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.