From aa2e302936af3c6f0a844abaebb8533fb3280c40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20V=C3=A9zina?= Date: Fri, 18 Mar 2016 12:04:27 -0400 Subject: [PATCH] Update feedparser.py 5.1.3 -> 5.2.1 --- gluon/contrib/feedparser.py | 1492 +++++++++++++++++------------------ 1 file changed, 743 insertions(+), 749 deletions(-) diff --git a/gluon/contrib/feedparser.py b/gluon/contrib/feedparser.py index c78e6a39..999cb0df 100644 --- a/gluon/contrib/feedparser.py +++ b/gluon/contrib/feedparser.py @@ -9,10 +9,10 @@ Required: Python 2.4 or later Recommended: iconv_codec """ -__version__ = "5.1.3" +__version__ = "5.2.1" __license__ = """ -Copyright (c) 2010-2012 Kurt McKee -Copyright (c) 2002-2008 Mark Pilgrim +Copyright 2010-2015 Kurt McKee +Copyright 2002-2008 Mark Pilgrim All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -61,15 +61,6 @@ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,ap # of pre-installed parsers until it finds one that supports everything we need. PREFERRED_XML_PARSERS = ["drv_libxml2"] -# If you want feedparser to automatically run HTML markup through HTML Tidy, set -# this to 1. Requires mxTidy -# or utidylib . -TIDY_MARKUP = 0 - -# List of Python interfaces for HTML Tidy, in order of preference. Only useful -# if TIDY_MARKUP = 1 -PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] - # If you want feedparser to automatically resolve all relative URIs, set this # to 1. RESOLVE_RELATIVE_URIS = 1 @@ -78,10 +69,6 @@ RESOLVE_RELATIVE_URIS = 1 # HTML content, set this to 1. SANITIZE_HTML = 1 -# If you want feedparser to automatically parse microformat content embedded -# in entry contents, set this to 1 -PARSE_MICROFORMATS = 1 - # ---------- Python 3 modules (make it work if possible) ---------- try: import rfc822 @@ -147,6 +134,7 @@ import cgi import codecs import copy import datetime +import itertools import re import struct import time @@ -203,8 +191,7 @@ else: _XML_AVAILABLE = 1 # sgmllib is not available by default in Python 3; if the end user doesn't have -# it available then we'll lose illformed XML parsing, content santizing, and -# microformat support (at least while feedparser depends on BeautifulSoup). +# it available then we'll lose illformed XML parsing and content santizing try: import sgmllib except ImportError: @@ -276,15 +263,6 @@ try: except ImportError: chardet = None -# BeautifulSoup is used to extract microformat content from HTML -# feedparser is tested using BeautifulSoup 3.2.0 -# http://www.crummy.com/software/BeautifulSoup/ -try: - import BeautifulSoup -except ImportError: - BeautifulSoup = None - PARSE_MICROFORMATS = False - # ---------- don't touch these ---------- class ThingsNobodyCaresAboutButMe(Exception): pass class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass @@ -328,6 +306,9 @@ class FeedParserDict(dict): 'tagline': 'subtitle', 'tagline_detail': 'subtitle_detail'} def __getitem__(self, key): + ''' + :return: A :class:`FeedParserDict`. + ''' if key == 'category': try: return dict.__getitem__(self, 'tags')[0]['term'] @@ -390,6 +371,9 @@ class FeedParserDict(dict): has_key = __contains__ def get(self, key, default=None): + ''' + :return: A :class:`FeedParserDict`. + ''' try: return self.__getitem__(key) except KeyError: @@ -451,16 +435,15 @@ _cp1252 = { _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') def _urljoin(base, uri): uri = _urifixer.sub(r'\1\3', uri) - #try: if not isinstance(uri, unicode): uri = uri.decode('utf-8', 'ignore') - uri = urlparse.urljoin(base, uri) + try: + uri = urlparse.urljoin(base, uri) + except ValueError: + uri = u'' if not isinstance(uri, unicode): return uri.decode('utf-8', 'ignore') return uri - #except: - # uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)]) - # return urlparse.urljoin(base, uri) class _FeedParserMixin: namespaces = { @@ -496,6 +479,8 @@ class _FeedParserMixin: 'http://freshmeat.net/rss/fm/': 'fm', 'http://xmlns.com/foaf/0.1/': 'foaf', 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', + 'http://www.georss.org/georss': 'georss', + 'http://www.opengis.net/gml': 'gml', 'http://postneo.com/icbm/': 'icbm', 'http://purl.org/rss/1.0/modules/image/': 'image', 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', @@ -527,6 +512,7 @@ class _FeedParserMixin: 'http://www.w3.org/1999/xhtml': 'xhtml', 'http://www.w3.org/1999/xlink': 'xlink', 'http://www.w3.org/XML/1998/namespace': 'xml', + 'http://podlove.org/simple-chapters': 'psc', } _matchnamespaces = {} @@ -556,6 +542,10 @@ class _FeedParserMixin: self.incontributor = 0 self.inpublisher = 0 self.insource = 0 + + # georss + self.ingeometry = 0 + self.sourcedata = FeedParserDict() self.contentparams = FeedParserDict() self._summaryKey = None @@ -568,6 +558,11 @@ class _FeedParserMixin: self.svgOK = 0 self.title_depth = -1 self.depth = 0 + # psc_chapters_flag prevents multiple psc_chapters from being + # captured in a single entry or item. The transition states are + # None -> True -> False. psc_chapter elements will only be + # captured while it is True. + self.psc_chapters_flag = None if baselang: self.feeddata['language'] = baselang.replace('_','-') @@ -892,7 +887,9 @@ class _FeedParserMixin: # resolve relative URIs if (element in self.can_be_relative_uri) and output: - output = self.resolveURI(output) + # do not resolve guid elements with isPermalink="false" + if not element == 'id' or self.guidislink: + output = self.resolveURI(output) # decode entities within embedded markup if not self.contentparams.get('base64', 0): @@ -920,22 +917,6 @@ class _FeedParserMixin: if element in self.can_contain_relative_uris: output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html')) - # parse microformats - # (must do this before sanitizing because some microformats - # rely on elements that we sanitize) - if PARSE_MICROFORMATS and is_htmlish and element in ['content', 'description', 'summary']: - mfresults = _parseMicroformats(output, self.baseuri, self.encoding) - if mfresults: - for tag in mfresults.get('tags', []): - self._addTag(tag['term'], tag['scheme'], tag['label']) - for enclosure in mfresults.get('enclosures', []): - self._start_enclosure(enclosure) - for xfn in mfresults.get('xfn', []): - self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) - vcard = mfresults.get('vcard') - if vcard: - self._getContext()['vcard'] = vcard - # sanitize embedded markup if is_htmlish and SANITIZE_HTML: if element in self.can_contain_dangerous_markup: @@ -956,8 +937,8 @@ class _FeedParserMixin: if isinstance(output, unicode): output = output.translate(_cp1252) - # categories/tags/keywords/whatever are handled in _end_category - if element == 'category': + # categories/tags/keywords/whatever are handled in _end_category or _end_tags or _end_itunes_keywords + if element in ('category', 'tags', 'itunes_keywords'): return output if element == 'title' and -1 < self.title_depth <= self.depth: @@ -975,6 +956,7 @@ class _FeedParserMixin: # query variables in urls in link elements are improperly # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're # unhandled character references. fix this special case. + output = output.replace('&', '&') output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) self.entries[-1][element] = output if output: @@ -1313,7 +1295,7 @@ class _FeedParserMixin: def _sync_author_detail(self, key='author'): context = self._getContext() - detail = context.get('%s_detail' % key) + detail = context.get('%ss' % key, [FeedParserDict()])[-1] if detail: name = detail.get('name') email = detail.get('email') @@ -1342,11 +1324,11 @@ class _FeedParserMixin: author = author[:-1] author = author.strip() if author or email: - context.setdefault('%s_detail' % key, FeedParserDict()) + context.setdefault('%s_detail' % key, detail) if author: - context['%s_detail' % key]['name'] = author + detail['name'] = author if email: - context['%s_detail' % key]['email'] = email + detail['email'] = email def _start_subtitle(self, attrsD): self.pushContent('subtitle', attrsD, u'text/plain', 1) @@ -1374,6 +1356,7 @@ class _FeedParserMixin: self.inentry = 1 self.guidislink = 0 self.title_depth = -1 + self.psc_chapters_flag = None id = self._getAttribute(attrsD, 'rdf:about') if id: context = self._getContext() @@ -1403,6 +1386,20 @@ class _FeedParserMixin: self._sync_author_detail('publisher') _end_webmaster = _end_dc_publisher + def _start_dcterms_valid(self, attrsD): + self.push('validity', 1) + + def _end_dcterms_valid(self): + for validity_detail in self.pop('validity').split(';'): + if '=' in validity_detail: + key, value = validity_detail.split('=', 1) + if key == 'start': + self._save('validity_start', value, overwrite=True) + self._save('validity_start_parsed', _parse_date(value), overwrite=True) + elif key == 'end': + self._save('validity_end', value, overwrite=True) + self._save('validity_end_parsed', _parse_date(value), overwrite=True) + def _start_published(self, attrsD): self.push('published', 1) _start_dcterms_issued = _start_published @@ -1447,6 +1444,128 @@ class _FeedParserMixin: def _end_expirationdate(self): self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True) + # geospatial location, or "where", from georss.org + + def _start_georssgeom(self, attrsD): + self.push('geometry', 0) + context = self._getContext() + context['where'] = FeedParserDict() + + _start_georss_point = _start_georssgeom + _start_georss_line = _start_georssgeom + _start_georss_polygon = _start_georssgeom + _start_georss_box = _start_georssgeom + + def _save_where(self, geometry): + context = self._getContext() + context['where'].update(geometry) + + def _end_georss_point(self): + geometry = _parse_georss_point(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _end_georss_line(self): + geometry = _parse_georss_line(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _end_georss_polygon(self): + this = self.pop('geometry') + geometry = _parse_georss_polygon(this) + if geometry: + self._save_where(geometry) + + def _end_georss_box(self): + geometry = _parse_georss_box(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _start_where(self, attrsD): + self.push('where', 0) + context = self._getContext() + context['where'] = FeedParserDict() + _start_georss_where = _start_where + + def _parse_srs_attrs(self, attrsD): + srsName = attrsD.get('srsname') + try: + srsDimension = int(attrsD.get('srsdimension', '2')) + except ValueError: + srsDimension = 2 + context = self._getContext() + context['where']['srsName'] = srsName + context['where']['srsDimension'] = srsDimension + + def _start_gml_point(self, attrsD): + self._parse_srs_attrs(attrsD) + self.ingeometry = 1 + self.push('geometry', 0) + + def _start_gml_linestring(self, attrsD): + self._parse_srs_attrs(attrsD) + self.ingeometry = 'linestring' + self.push('geometry', 0) + + def _start_gml_polygon(self, attrsD): + self._parse_srs_attrs(attrsD) + self.push('geometry', 0) + + def _start_gml_exterior(self, attrsD): + self.push('geometry', 0) + + def _start_gml_linearring(self, attrsD): + self.ingeometry = 'polygon' + self.push('geometry', 0) + + def _start_gml_pos(self, attrsD): + self.push('pos', 0) + + def _end_gml_pos(self): + this = self.pop('pos') + context = self._getContext() + srsName = context['where'].get('srsName') + srsDimension = context['where'].get('srsDimension', 2) + swap = True + if srsName and "EPSG" in srsName: + epsg = int(srsName.split(":")[-1]) + swap = bool(epsg in _geogCS) + geometry = _parse_georss_point(this, swap=swap, dims=srsDimension) + if geometry: + self._save_where(geometry) + + def _start_gml_poslist(self, attrsD): + self.push('pos', 0) + + def _end_gml_poslist(self): + this = self.pop('pos') + context = self._getContext() + srsName = context['where'].get('srsName') + srsDimension = context['where'].get('srsDimension', 2) + swap = True + if srsName and "EPSG" in srsName: + epsg = int(srsName.split(":")[-1]) + swap = bool(epsg in _geogCS) + geometry = _parse_poslist( + this, self.ingeometry, swap=swap, dims=srsDimension) + if geometry: + self._save_where(geometry) + + def _end_geom(self): + self.ingeometry = 0 + self.pop('geometry') + _end_gml_point = _end_geom + _end_gml_linestring = _end_geom + _end_gml_linearring = _end_geom + _end_gml_exterior = _end_geom + _end_gml_polygon = _end_geom + + def _end_where(self): + self.pop('where') + _end_georss_where = _end_where + + # end geospatial + def _start_cc_license(self, attrsD): context = self._getContext() value = self._getAttribute(attrsD, 'rdf:resource') @@ -1471,22 +1590,25 @@ class _FeedParserMixin: del context['license'] _end_creativeCommons_license = _end_creativecommons_license - def _addXFN(self, relationships, href, name): - context = self._getContext() - xfn = context.setdefault('xfn', []) - value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) - if value not in xfn: - xfn.append(value) - def _addTag(self, term, scheme, label): context = self._getContext() tags = context.setdefault('tags', []) if (not term) and (not scheme) and (not label): return - value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) + value = FeedParserDict(term=term, scheme=scheme, label=label) if value not in tags: tags.append(value) + def _start_tags(self, attrsD): + # This is a completely-made up element. Its semantics are determined + # only by a single feed that precipitated bug report 392 on Google Code. + # In short, this is junk code. + self.push('tags', 1) + + def _end_tags(self): + for term in self.pop('tags').split(','): + self._addTag(term.strip(), None, None) + def _start_category(self, attrsD): term = attrsD.get('term') scheme = attrsD.get('scheme', attrsD.get('domain')) @@ -1505,6 +1627,11 @@ class _FeedParserMixin: if term.strip(): self._addTag(term.strip(), u'http://www.itunes.com/', None) + def _end_media_keywords(self): + for term in self.pop('media_keywords').split(','): + if term.strip(): + self._addTag(term.strip(), None, None) + def _start_itunes_category(self, attrsD): self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None) self.push('category', 1) @@ -1594,6 +1721,7 @@ class _FeedParserMixin: else: self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource) _start_dc_description = _start_description + _start_media_description = _start_description def _start_abstract(self, attrsD): self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource) @@ -1606,6 +1734,7 @@ class _FeedParserMixin: self._summaryKey = None _end_abstract = _end_description _end_dc_description = _end_description + _end_media_description = _end_description def _start_info(self, attrsD): self.pushContent('info', attrsD, u'text/plain', 1) @@ -1729,6 +1858,55 @@ class _FeedParserMixin: # by applications that only need to know if the content is explicit. self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] + def _start_media_group(self, attrsD): + # don't do anything, but don't break the enclosed tags either + pass + + def _start_media_rating(self, attrsD): + context = self._getContext() + context.setdefault('media_rating', attrsD) + self.push('rating', 1) + + def _end_media_rating(self): + rating = self.pop('rating') + if rating is not None and rating.strip(): + context = self._getContext() + context['media_rating']['content'] = rating + + def _start_media_credit(self, attrsD): + context = self._getContext() + context.setdefault('media_credit', []) + context['media_credit'].append(attrsD) + self.push('credit', 1) + + def _end_media_credit(self): + credit = self.pop('credit') + if credit != None and len(credit.strip()) != 0: + context = self._getContext() + context['media_credit'][-1]['content'] = credit + + def _start_media_restriction(self, attrsD): + context = self._getContext() + context.setdefault('media_restriction', attrsD) + self.push('restriction', 1) + + def _end_media_restriction(self): + restriction = self.pop('restriction') + if restriction != None and len(restriction.strip()) != 0: + context = self._getContext() + context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')] + + def _start_media_license(self, attrsD): + context = self._getContext() + context.setdefault('media_license', attrsD) + self.push('license', 1) + + def _end_media_license(self): + license = self.pop('license') + if license != None and len(license.strip()) != 0: + context = self._getContext() + context['media_license']['content'] = license + def _start_media_content(self, attrsD): context = self._getContext() context.setdefault('media_content', []) @@ -1767,6 +1945,26 @@ class _FeedParserMixin: return context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip()) + def _start_psc_chapters(self, attrsD): + if self.psc_chapters_flag is None: + # Transition from None -> True + self.psc_chapters_flag = True + attrsD['chapters'] = [] + self._getContext()['psc_chapters'] = FeedParserDict(attrsD) + + def _end_psc_chapters(self): + # Transition from True -> False + self.psc_chapters_flag = False + + def _start_psc_chapter(self, attrsD): + if self.psc_chapters_flag: + start = self._getAttribute(attrsD, 'start') + attrsD['start_parsed'] = _parse_psc_chapter_start(start) + + context = self._getContext()['psc_chapters'] + context['chapters'].append(FeedParserDict(attrsD)) + + if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): @@ -1830,6 +2028,7 @@ if _XML_AVAILABLE: attrsD[str(attrlocalname).lower()] = attrvalue for qname in attrs.getQNames(): attrsD[str(qname).lower()] = attrs.getValueByQName(qname) + localname = str(localname).lower() self.unknown_starttag(localname, attrsD.items()) def characters(self, text): @@ -2076,455 +2275,18 @@ class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace(''', "'") + data = data.replace('/', '/') + data = data.replace('/', '/') return data def strattrs(self, attrs): return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs]) -class _MicroformatsParser: - STRING = 1 - DATE = 2 - URI = 3 - NODE = 4 - EMAIL = 5 - - known_xfn_relationships = set(['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']) - known_binary_extensions = set(['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']) - - def __init__(self, data, baseuri, encoding): - self.document = BeautifulSoup.BeautifulSoup(data) - self.baseuri = baseuri - self.encoding = encoding - if isinstance(data, unicode): - data = data.encode(encoding) - self.tags = [] - self.enclosures = [] - self.xfn = [] - self.vcard = None - - def vcardEscape(self, s): - if isinstance(s, basestring): - s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') - return s - - def vcardFold(self, s): - s = re.sub(';+$', '', s) - sFolded = '' - iMax = 75 - sPrefix = '' - while len(s) > iMax: - sFolded += sPrefix + s[:iMax] + '\n' - s = s[iMax:] - sPrefix = ' ' - iMax = 74 - sFolded += sPrefix + s - return sFolded - - def normalize(self, s): - return re.sub(r'\s+', ' ', s).strip() - - def unique(self, aList): - results = [] - for element in aList: - if element not in results: - results.append(element) - return results - - def toISO8601(self, dt): - return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) - - def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0): - all = lambda x: 1 - sProperty = sProperty.lower() - bFound = 0 - bNormalize = 1 - propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)} - if bAllowMultiple and (iPropertyType != self.NODE): - snapResults = [] - containers = elmRoot(['ul', 'ol'], propertyMatch) - for container in containers: - snapResults.extend(container('li')) - bFound = (len(snapResults) != 0) - if not bFound: - snapResults = elmRoot(all, propertyMatch) - bFound = (len(snapResults) != 0) - if (not bFound) and (sProperty == 'value'): - snapResults = elmRoot('pre') - bFound = (len(snapResults) != 0) - bNormalize = not bFound - if not bFound: - snapResults = [elmRoot] - bFound = (len(snapResults) != 0) - arFilter = [] - if sProperty == 'vcard': - snapFilter = elmRoot(all, propertyMatch) - for node in snapFilter: - if node.findParent(all, propertyMatch): - arFilter.append(node) - arResults = [] - for node in snapResults: - if node not in arFilter: - arResults.append(node) - bFound = (len(arResults) != 0) - if not bFound: - if bAllowMultiple: - return [] - elif iPropertyType == self.STRING: - return '' - elif iPropertyType == self.DATE: - return None - elif iPropertyType == self.URI: - return '' - elif iPropertyType == self.NODE: - return None - else: - return None - arValues = [] - for elmResult in arResults: - sValue = None - if iPropertyType == self.NODE: - if bAllowMultiple: - arValues.append(elmResult) - continue - else: - return elmResult - sNodeName = elmResult.name.lower() - if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): - sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0] - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (sNodeName == 'abbr'): - sValue = elmResult.get('title') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (iPropertyType == self.URI): - if sNodeName == 'a': - sValue = elmResult.get('href') - elif sNodeName == 'img': - sValue = elmResult.get('src') - elif sNodeName == 'object': - sValue = elmResult.get('data') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (sNodeName == 'img'): - sValue = elmResult.get('alt') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if not sValue: - sValue = elmResult.renderContents() - sValue = re.sub(r'<\S[^>]*>', '', sValue) - sValue = sValue.replace('\r\n', '\n') - sValue = sValue.replace('\r', '\n') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if not sValue: - continue - if iPropertyType == self.DATE: - sValue = _parse_date_iso8601(sValue) - if bAllowMultiple: - arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) - else: - return bAutoEscape and self.vcardEscape(sValue) or sValue - return arValues - - def findVCards(self, elmRoot, bAgentParsing=0): - sVCards = '' - - if not bAgentParsing: - arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) - else: - arCards = [elmRoot] - - for elmCard in arCards: - arLines = [] - - def processSingleString(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) - if sValue: - arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) - return sValue or u'' - - def processSingleURI(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.URI) - if sValue: - sContentType = '' - sEncoding = '' - sValueKey = '' - if sValue.startswith('data:'): - sEncoding = ';ENCODING=b' - sContentType = sValue.split(';')[0].split('/').pop() - sValue = sValue.split(',', 1).pop() - else: - elmValue = self.getPropertyValue(elmCard, sProperty) - if elmValue: - if sProperty != 'url': - sValueKey = ';VALUE=uri' - sContentType = elmValue.get('type', '').strip().split('/').pop().strip() - sContentType = sContentType.upper() - if sContentType == 'OCTET-STREAM': - sContentType = '' - if sContentType: - sContentType = ';TYPE=' + sContentType.upper() - arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) - - def processTypeValue(sProperty, arDefaultType, arForceType=None): - arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) - for elmResult in arResults: - arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) - if arForceType: - arType = self.unique(arForceType + arType) - if not arType: - arType = arDefaultType - sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) - if sValue: - arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) - - # AGENT - # must do this before all other properties because it is destructive - # (removes nested class="vcard" nodes so they don't interfere with - # this vcard's other properties) - arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) - for elmAgent in arAgent: - if re.compile(r'\bvcard\b').search(elmAgent.get('class')): - sAgentValue = self.findVCards(elmAgent, 1) + '\n' - sAgentValue = sAgentValue.replace('\n', '\\n') - sAgentValue = sAgentValue.replace(';', '\\;') - if sAgentValue: - arLines.append(self.vcardFold('AGENT:' + sAgentValue)) - # Completely remove the agent element from the parse tree - elmAgent.extract() - else: - sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); - if sAgentValue: - arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) - - # FN (full name) - sFN = processSingleString('fn') - - # N (name) - elmName = self.getPropertyValue(elmCard, 'n') - if elmName: - sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) - sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) - arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) - arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) - arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) - arLines.append(self.vcardFold('N:' + sFamilyName + ';' + - sGivenName + ';' + - ','.join(arAdditionalNames) + ';' + - ','.join(arHonorificPrefixes) + ';' + - ','.join(arHonorificSuffixes))) - elif sFN: - # implied "N" optimization - # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization - arNames = self.normalize(sFN).split() - if len(arNames) == 2: - bFamilyNameFirst = (arNames[0].endswith(',') or - len(arNames[1]) == 1 or - ((len(arNames[1]) == 2) and (arNames[1].endswith('.')))) - if bFamilyNameFirst: - arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) - else: - arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) - - # SORT-STRING - sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) - if sSortString: - arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) - - # NICKNAME - arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) - if arNickname: - arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) - - # PHOTO - processSingleURI('photo') - - # BDAY - dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) - if dtBday: - arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) - - # ADR (address) - arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) - for elmAdr in arAdr: - arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) - if not arType: - arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 - sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) - sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) - sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) - sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) - sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) - sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1) - sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) - arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' + - sPostOfficeBox + ';' + - sExtendedAddress + ';' + - sStreetAddress + ';' + - sLocality + ';' + - sRegion + ';' + - sPostalCode + ';' + - sCountryName)) - - # LABEL - processTypeValue('label', ['intl','postal','parcel','work']) - - # TEL (phone number) - processTypeValue('tel', ['voice']) - - # EMAIL - processTypeValue('email', ['internet'], ['internet']) - - # MAILER - processSingleString('mailer') - - # TZ (timezone) - processSingleString('tz') - - # GEO (geographical information) - elmGeo = self.getPropertyValue(elmCard, 'geo') - if elmGeo: - sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) - sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) - arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) - - # TITLE - processSingleString('title') - - # ROLE - processSingleString('role') - - # LOGO - processSingleURI('logo') - - # ORG (organization) - elmOrg = self.getPropertyValue(elmCard, 'org') - if elmOrg: - sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) - if not sOrganizationName: - # implied "organization-name" optimization - # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization - sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) - if sOrganizationName: - arLines.append(self.vcardFold('ORG:' + sOrganizationName)) - else: - arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) - arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) - - # CATEGORY - arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) - if arCategory: - arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) - - # NOTE - processSingleString('note') - - # REV - processSingleString('rev') - - # SOUND - processSingleURI('sound') - - # UID - processSingleString('uid') - - # URL - processSingleURI('url') - - # CLASS - processSingleString('class') - - # KEY - processSingleURI('key') - - if arLines: - arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard'] - # XXX - this is super ugly; properly fix this with issue 148 - for i, s in enumerate(arLines): - if not isinstance(s, unicode): - arLines[i] = s.decode('utf-8', 'ignore') - sVCards += u'\n'.join(arLines) + u'\n' - - return sVCards.strip() - - def isProbablyDownloadable(self, elm): - attrsD = elm.attrMap - if 'href' not in attrsD: - return 0 - linktype = attrsD.get('type', '').strip() - if linktype.startswith('audio/') or \ - linktype.startswith('video/') or \ - (linktype.startswith('application/') and not linktype.endswith('xml')): - return 1 - try: - path = urlparse.urlparse(attrsD['href'])[2] - except ValueError: - return 0 - if path.find('.') == -1: - return 0 - fileext = path.split('.').pop().lower() - return fileext in self.known_binary_extensions - - def findTags(self): - all = lambda x: 1 - for elm in self.document(all, {'rel': re.compile(r'\btag\b')}): - href = elm.get('href') - if not href: - continue - urlscheme, domain, path, params, query, fragment = \ - urlparse.urlparse(_urljoin(self.baseuri, href)) - segments = path.split('/') - tag = segments.pop() - if not tag: - if segments: - tag = segments.pop() - else: - # there are no tags - continue - tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', '')) - if not tagscheme.endswith('/'): - tagscheme += '/' - self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''})) - - def findEnclosures(self): - all = lambda x: 1 - enclosure_match = re.compile(r'\benclosure\b') - for elm in self.document(all, {'href': re.compile(r'.+')}): - if not enclosure_match.search(elm.get('rel', u'')) and not self.isProbablyDownloadable(elm): - continue - if elm.attrMap not in self.enclosures: - self.enclosures.append(elm.attrMap) - if elm.string and not elm.get('title'): - self.enclosures[-1]['title'] = elm.string - - def findXFN(self): - all = lambda x: 1 - for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}): - rels = elm.get('rel', u'').split() - xfn_rels = [r for r in rels if r in self.known_xfn_relationships] - if xfn_rels: - self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string}) - -def _parseMicroformats(htmlSource, baseURI, encoding): - if not BeautifulSoup: - return - try: - p = _MicroformatsParser(htmlSource, baseURI, encoding) - except UnicodeEncodeError: - # sgmllib throws this exception when performing lookups of tags - # with non-ASCII characters in them. - return - p.vcard = p.findVCards(p.document) - p.findTags() - p.findEnclosures() - p.findXFN() - return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard} - class _RelativeURIResolver(_BaseHTMLProcessor): relative_uris = set([('a', 'href'), ('applet', 'codebase'), ('area', 'href'), + ('audio', 'src'), ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'), @@ -2547,7 +2309,9 @@ class _RelativeURIResolver(_BaseHTMLProcessor): ('object', 'usemap'), ('q', 'cite'), ('script', 'src'), - ('video', 'poster')]) + ('source', 'src'), + ('video', 'poster'), + ('video', 'src')]) def __init__(self, baseuri, encoding, _type): _BaseHTMLProcessor.__init__(self, encoding, _type) @@ -2572,10 +2336,7 @@ def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): def _makeSafeAbsoluteURI(base, rel=None): # bail if ACCEPTABLE_URI_SCHEMES is empty if not ACCEPTABLE_URI_SCHEMES: - try: - return _urljoin(base, rel or u'') - except ValueError: - return u'' + return _urljoin(base, rel or u'') if not base: return rel or u'' if not rel: @@ -2586,10 +2347,7 @@ def _makeSafeAbsoluteURI(base, rel=None): if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: return base return u'' - try: - uri = _urljoin(base, rel) - except ValueError: - return u'' + uri = _urljoin(base, rel) if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: return u'' return uri @@ -2657,21 +2415,154 @@ class _HTMLSanitizer(_BaseHTMLProcessor): valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') - mathml_elements = set(['annotation', 'annotation-xml', 'maction', 'math', - 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', - 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', - 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', - 'munderover', 'none', 'semantics']) + mathml_elements = set([ + 'annotation', + 'annotation-xml', + 'maction', + 'maligngroup', + 'malignmark', + 'math', + 'menclose', + 'merror', + 'mfenced', + 'mfrac', + 'mglyph', + 'mi', + 'mlabeledtr', + 'mlongdiv', + 'mmultiscripts', + 'mn', + 'mo', + 'mover', + 'mpadded', + 'mphantom', + 'mprescripts', + 'mroot', + 'mrow', + 'ms', + 'mscarries', + 'mscarry', + 'msgroup', + 'msline', + 'mspace', + 'msqrt', + 'msrow', + 'mstack', + 'mstyle', + 'msub', + 'msubsup', + 'msup', + 'mtable', + 'mtd', + 'mtext', + 'mtr', + 'munder', + 'munderover', + 'none', + 'semantics', + ]) - mathml_attributes = set(['actiontype', 'align', 'columnalign', 'columnalign', - 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', - 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', - 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', - 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', - 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', - 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', - 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', - 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']) + mathml_attributes = set([ + 'accent', + 'accentunder', + 'actiontype', + 'align', + 'alignmentscope', + 'altimg', + 'altimg-height', + 'altimg-valign', + 'altimg-width', + 'alttext', + 'bevelled', + 'charalign', + 'close', + 'columnalign', + 'columnlines', + 'columnspacing', + 'columnspan', + 'columnwidth', + 'crossout', + 'decimalpoint', + 'denomalign', + 'depth', + 'dir', + 'display', + 'displaystyle', + 'edge', + 'encoding', + 'equalcolumns', + 'equalrows', + 'fence', + 'fontstyle', + 'fontweight', + 'form', + 'frame', + 'framespacing', + 'groupalign', + 'height', + 'href', + 'id', + 'indentalign', + 'indentalignfirst', + 'indentalignlast', + 'indentshift', + 'indentshiftfirst', + 'indentshiftlast', + 'indenttarget', + 'infixlinebreakstyle', + 'largeop', + 'length', + 'linebreak', + 'linebreakmultchar', + 'linebreakstyle', + 'lineleading', + 'linethickness', + 'location', + 'longdivstyle', + 'lquote', + 'lspace', + 'mathbackground', + 'mathcolor', + 'mathsize', + 'mathvariant', + 'maxsize', + 'minlabelspacing', + 'minsize', + 'movablelimits', + 'notation', + 'numalign', + 'open', + 'other', + 'overflow', + 'position', + 'rowalign', + 'rowlines', + 'rowspacing', + 'rowspan', + 'rquote', + 'rspace', + 'scriptlevel', + 'scriptminsize', + 'scriptsizemultiplier', + 'selection', + 'separator', + 'separators', + 'shift', + 'side', + 'src', + 'stackalign', + 'stretchy', + 'subscriptshift', + 'superscriptshift', + 'symmetric', + 'voffset', + 'width', + 'xlink:href', + 'xlink:show', + 'xlink:type', + 'xmlns', + 'xmlns:xlink', + ]) # svgtiny - foreignObject + linearGradient + radialGradient + stop svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion', @@ -2860,38 +2751,6 @@ def _sanitizeHTML(htmlSource, encoding, _type): htmlSource = htmlSource.replace(''): - data = data.split('>', 1)[1] - if data.count(' julian: - if diff < day: - day = day - diff - else: - month = month - 1 - day = 31 - elif jday < julian: - if day + diff < 28: - day = day + diff - else: - month = month + 1 - return year, month, day - month = m.group('month') - day = 1 - if month is None: - month = 1 - else: - month = int(month) - day = m.group('day') - if day: - day = int(day) - else: - day = 1 - return year, month, day - - def __extract_time(m): - if not m: - return 0, 0, 0 - hours = m.group('hours') - if not hours: - return 0, 0, 0 - hours = int(hours) - minutes = int(m.group('minutes')) - seconds = m.group('seconds') - if seconds: - seconds = int(seconds) - else: - seconds = 0 - return hours, minutes, seconds - - def __extract_tzd(m): - '''Return the Time Zone Designator as an offset in seconds from UTC.''' - if not m: - return 0 - tzd = m.group('tzd') - if not tzd: - return 0 - if tzd == 'Z': - return 0 - hours = int(m.group('tzdhours')) - minutes = m.group('tzdminutes') - if minutes: - minutes = int(minutes) - else: - minutes = 0 - offset = (hours*60 + minutes) * 60 - if tzd[0] == '+': - return -offset - return offset - - __date_re = ('(?P\d\d\d\d)' - '(?:(?P-|)' - '(?:(?P\d\d)(?:(?P=dsep)(?P\d\d))?' - '|(?P\d\d\d)))?') - __tzd_re = ' ?(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)?' - __time_re = ('(?P\d\d)(?P:|)(?P\d\d)' - '(?:(?P=tsep)(?P\d\d)(?:[.,]\d+)?)?' - + __tzd_re) - __datetime_re = '%s(?:[T ]%s)?' % (__date_re, __time_re) - __datetime_rx = re.compile(__datetime_re) - m = __datetime_rx.match(dateString) - if (m is None) or (m.group() != dateString): - return - gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) - if gmt[0] == 0: - return - return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) -registerDateHandler(_parse_date_w3dtf) - -# Define the strings used by the RFC822 datetime parser -_rfc822_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', - 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] -_rfc822_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'] - -# Only the first three letters of the month name matter -_rfc822_month = "(?P%s)(?:[a-z]*,?)" % ('|'.join(_rfc822_months)) -# The year may be 2 or 4 digits; capture the century if it exists -_rfc822_year = "(?P(?:\d{2})?\d{2})" -_rfc822_day = "(?P *\d{1,2})" -_rfc822_date = "%s %s %s" % (_rfc822_day, _rfc822_month, _rfc822_year) - -_rfc822_hour = "(?P\d{2}):(?P\d{2})(?::(?P\d{2}))?" -_rfc822_tz = "(?Put|gmt(?:[+-]\d{2}:\d{2})?|[aecmp][sd]?t|[zamny]|[+-]\d{4})" -_rfc822_tznames = { +timezonenames = { 'ut': 0, 'gmt': 0, 'z': 0, 'adt': -3, 'ast': -4, 'at': -4, 'edt': -4, 'est': -5, 'et': -5, @@ -3468,86 +3224,206 @@ _rfc822_tznames = { 'pdt': -7, 'pst': -8, 'pt': -8, 'a': -1, 'n': 1, 'm': -12, 'y': 12, - } -# The timezone may be prefixed by 'Etc/' -_rfc822_time = "%s (?:etc/)?%s" % (_rfc822_hour, _rfc822_tz) - -_rfc822_dayname = "(?P%s)" % ('|'.join(_rfc822_daynames)) -_rfc822_match = re.compile( - "(?:%s, )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date, _rfc822_time) -).match - -def _parse_date_group_rfc822(m): - # Calculate a date and timestamp - for k in ('year', 'day', 'hour', 'minute', 'second'): - m[k] = int(m[k]) - m['month'] = _rfc822_months.index(m['month']) + 1 - # If the year is 2 digits, assume everything in the 90's is the 1990's - if m['year'] < 100: - m['year'] += (1900, 2000)[m['year'] < 90] - stamp = datetime.datetime(*[m[i] for i in - ('year', 'month', 'day', 'hour', 'minute', 'second')]) - - # Use the timezone information to calculate the difference between - # the given date and timestamp and Universal Coordinated Time +} +# W3 date and time format parser +# http://www.w3.org/TR/NOTE-datetime +# Also supports MSSQL-style datetimes as defined at: +# http://msdn.microsoft.com/en-us/library/ms186724.aspx +# (basically, allow a space as a date/time/timezone separator) +def _parse_date_w3dtf(datestr): + if not datestr.strip(): + return None + parts = datestr.lower().split('t') + if len(parts) == 1: + # This may be a date only, or may be an MSSQL-style date + parts = parts[0].split() + if len(parts) == 1: + # Treat this as a date only + parts.append('00:00:00z') + elif len(parts) > 2: + return None + date = parts[0].split('-', 2) + if not date or len(date[0]) != 4: + return None + # Ensure that `date` has 3 elements. Using '1' sets the default + # month to January and the default day to the 1st of the month. + date.extend(['1'] * (3 - len(date))) + try: + year, month, day = [int(i) for i in date] + except ValueError: + # `date` may have more than 3 elements or may contain + # non-integer strings. + return None + if parts[1].endswith('z'): + parts[1] = parts[1][:-1] + parts.append('z') + # Append the numeric timezone offset, if any, to parts. + # If this is an MSSQL-style date then parts[2] already contains + # the timezone information, so `append()` will not affect it. + # Add 1 to each value so that if `find()` returns -1 it will be + # treated as False. + loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1 + loc = loc - 1 + parts.append(parts[1][loc:]) + parts[1] = parts[1][:loc] + time = parts[1].split(':', 2) + # Ensure that time has 3 elements. Using '0' means that the + # minutes and seconds, if missing, will default to 0. + time.extend(['0'] * (3 - len(time))) tzhour = 0 tzmin = 0 - if m['tz'] and m['tz'].startswith('gmt'): - # Handle GMT and GMT+hh:mm timezone syntax (the trailing - # timezone info will be handled by the next `if` block) - m['tz'] = ''.join(m['tz'][3:].split(':')) or 'gmt' - if not m['tz']: - pass - elif m['tz'].startswith('+'): - tzhour = int(m['tz'][1:3]) - tzmin = int(m['tz'][3:]) - elif m['tz'].startswith('-'): - tzhour = int(m['tz'][1:3]) * -1 - tzmin = int(m['tz'][3:]) * -1 + if parts[2][:1] in ('-', '+'): + try: + tzhour = int(parts[2][1:3]) + tzmin = int(parts[2][4:]) + except ValueError: + return None + if parts[2].startswith('-'): + tzhour = tzhour * -1 + tzmin = tzmin * -1 else: - tzhour = _rfc822_tznames[m['tz']] - delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour) - - # Return the date and timestamp in UTC - return (stamp - delta).utctimetuple() - -def _parse_date_rfc822(dt): - """Parse RFC 822 dates and times, with one minor - difference: years may be 4DIGIT or 2DIGIT. - http://tools.ietf.org/html/rfc822#section-5""" + tzhour = timezonenames.get(parts[2], 0) try: - m = _rfc822_match(dt.lower()).groupdict(0) - except AttributeError: + hour, minute, second = [int(float(i)) for i in time] + except ValueError: + return None + # Create the datetime object and timezone delta objects + try: + stamp = datetime.datetime(year, month, day, hour, minute, second) + except ValueError: + return None + delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour) + # Return the date and timestamp in a UTC 9-tuple + try: + return (stamp - delta).utctimetuple() + except (OverflowError, ValueError): + # IronPython throws ValueErrors instead of OverflowErrors return None - return _parse_date_group_rfc822(m) +registerDateHandler(_parse_date_w3dtf) + +def _parse_date_rfc822(date): + """Parse RFC 822 dates and times + http://tools.ietf.org/html/rfc822#section-5 + + There are some formatting differences that are accounted for: + 1. Years may be two or four digits. + 2. The month and day can be swapped. + 3. Additional timezone names are supported. + 4. A default time and timezone are assumed if only a date is present. + """ + daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']) + months = { + 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, + 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12, + } + + parts = date.lower().split() + if len(parts) < 5: + # Assume that the time and timezone are missing + parts.extend(('00:00:00', '0000')) + # Remove the day name + if parts[0][:3] in daynames: + parts = parts[1:] + if len(parts) < 5: + # If there are still fewer than five parts, there's not enough + # information to interpret this + return None + try: + day = int(parts[0]) + except ValueError: + # Check if the day and month are swapped + if months.get(parts[0][:3]): + try: + day = int(parts[1]) + except ValueError: + return None + else: + parts[1] = parts[0] + else: + return None + month = months.get(parts[1][:3]) + if not month: + return None + try: + year = int(parts[2]) + except ValueError: + return None + # Normalize two-digit years: + # Anything in the 90's is interpreted as 1990 and on + # Anything 89 or less is interpreted as 2089 or before + if len(parts[2]) <= 2: + year += (1900, 2000)[year < 90] + timeparts = parts[3].split(':') + timeparts = timeparts + ([0] * (3 - len(timeparts))) + try: + (hour, minute, second) = map(int, timeparts) + except ValueError: + return None + tzhour = 0 + tzmin = 0 + # Strip 'Etc/' from the timezone + if parts[4].startswith('etc/'): + parts[4] = parts[4][4:] + # Normalize timezones that start with 'gmt': + # GMT-05:00 => -0500 + # GMT => GMT + if parts[4].startswith('gmt'): + parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt' + # Handle timezones like '-0500', '+0500', and 'EST' + if parts[4] and parts[4][0] in ('-', '+'): + try: + tzhour = int(parts[4][1:3]) + tzmin = int(parts[4][3:]) + except ValueError: + return None + if parts[4].startswith('-'): + tzhour = tzhour * -1 + tzmin = tzmin * -1 + else: + tzhour = timezonenames.get(parts[4], 0) + # Create the datetime object and timezone delta objects + try: + stamp = datetime.datetime(year, month, day, hour, minute, second) + except ValueError: + return None + delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour) + # Return the date and timestamp in a UTC 9-tuple + try: + return (stamp - delta).utctimetuple() + except (OverflowError, ValueError): + # IronPython throws ValueErrors instead of OverflowErrors + return None registerDateHandler(_parse_date_rfc822) -def _parse_date_rfc822_grubby(dt): - """Parse date format similar to RFC 822, but - the comma after the dayname is optional and - month/day are inverted""" - _rfc822_date_grubby = "%s %s %s" % (_rfc822_month, _rfc822_day, _rfc822_year) - _rfc822_match_grubby = re.compile( - "(?:%s[,]? )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date_grubby, _rfc822_time) - ).match +_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', + 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] +def _parse_date_asctime(dt): + """Parse asctime-style dates. - try: - m = _rfc822_match_grubby(dt.lower()).groupdict(0) - except AttributeError: + Converts asctime to RFC822-compatible dates and uses the RFC822 parser + to do the actual parsing. + + Supported formats (format is standardized to the first one listed): + + * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy + * {weekday name} {month name} dd hh:mm:ss yyyy + """ + + parts = dt.split() + + # Insert a GMT timezone, if needed. + if len(parts) == 5: + parts.insert(4, '+0000') + + # Exit if there are not six parts. + if len(parts) != 6: return None - return _parse_date_group_rfc822(m) -registerDateHandler(_parse_date_rfc822_grubby) - -def _parse_date_asctime(dt): - """Parse asctime-style dates""" - dayname, month, day, remainder = dt.split(None, 3) - # Convert month and day into zero-padded integers - month = '%02i ' % (_rfc822_months.index(month.lower()) + 1) - day = '%02i ' % (int(day),) - dt = month + day + remainder - return time.strptime(dt, '%m %d %H:%M:%S %Y')[:-1] + (0, ) + # Reassemble the parts in an RFC822-compatible order and parse them. + return _parse_date_rfc822(' '.join([ + parts[0], parts[2], parts[1], parts[5], parts[3], parts[4], + ])) registerDateHandler(_parse_date_asctime) def _parse_date_perforce(aDateString): @@ -3725,7 +3601,7 @@ def convert_to_utf8(http_headers, data): u'application/xml-external-parsed-entity') text_content_types = (u'text/xml', u'text/xml-external-parsed-entity') if (http_content_type in application_content_types) or \ - (http_content_type.startswith(u'application/') and + (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')): acceptable_content_type = 1 rfc3023_encoding = http_encoding or xml_encoding or u'utf-8' @@ -3763,13 +3639,21 @@ def convert_to_utf8(http_headers, data): # determine character encoding known_encoding = 0 - chardet_encoding = None + lazy_chardet_encoding = None tried_encodings = [] if chardet: - chardet_encoding = unicode(chardet.detect(data)['encoding'] or '', 'ascii', 'ignore') + def lazy_chardet_encoding(): + chardet_encoding = chardet.detect(data)['encoding'] + if not chardet_encoding: + chardet_encoding = '' + if not isinstance(chardet_encoding, unicode): + chardet_encoding = unicode(chardet_encoding, 'ascii', 'ignore') + return chardet_encoding # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding, - chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'): + lazy_chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'): + if callable(proposed_encoding): + proposed_encoding = proposed_encoding() if not proposed_encoding: continue if proposed_encoding in tried_encodings: @@ -3861,11 +3745,83 @@ def replace_doctype(data): for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)) return version, data, safe_entities + +# GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates' +# items, or None in the case of a parsing error. + +def _parse_poslist(value, geom_type, swap=True, dims=2): + if geom_type == 'linestring': + return _parse_georss_line(value, swap, dims) + elif geom_type == 'polygon': + ring = _parse_georss_line(value, swap, dims) + return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)} + else: + return None + +def _gen_georss_coords(value, swap=True, dims=2): + # A generator of (lon, lat) pairs from a string of encoded GeoRSS + # coordinates. Converts to floats and swaps order. + latlons = itertools.imap(float, value.strip().replace(',', ' ').split()) + nxt = latlons.next + while True: + t = [nxt(), nxt()][::swap and -1 or 1] + if dims == 3: + t.append(nxt()) + yield tuple(t) + +def _parse_georss_point(value, swap=True, dims=2): + # A point contains a single latitude-longitude pair, separated by + # whitespace. We'll also handle comma separators. + try: + coords = list(_gen_georss_coords(value, swap, dims)) + return {u'type': u'Point', u'coordinates': coords[0]} + except (IndexError, ValueError): + return None + +def _parse_georss_line(value, swap=True, dims=2): + # A line contains a space separated list of latitude-longitude pairs in + # WGS84 coordinate reference system, with each pair separated by + # whitespace. There must be at least two pairs. + try: + coords = list(_gen_georss_coords(value, swap, dims)) + return {u'type': u'LineString', u'coordinates': coords} + except (IndexError, ValueError): + return None + +def _parse_georss_polygon(value, swap=True, dims=2): + # A polygon contains a space separated list of latitude-longitude pairs, + # with each pair separated by whitespace. There must be at least four + # pairs, with the last being identical to the first (so a polygon has a + # minimum of three actual points). + try: + ring = list(_gen_georss_coords(value, swap, dims)) + except (IndexError, ValueError): + return None + if len(ring) < 4: + return None + return {u'type': u'Polygon', u'coordinates': (ring,)} + +def _parse_georss_box(value, swap=True, dims=2): + # A bounding box is a rectangular region, often used to define the extents + # of a map or a rough area of interest. A box contains two space seperate + # latitude-longitude pairs, with each pair separated by whitespace. The + # first pair is the lower corner, the second is the upper corner. + try: + coords = list(_gen_georss_coords(value, swap, dims)) + return {u'type': u'Box', u'coordinates': tuple(coords)} + except (IndexError, ValueError): + return None + +# end geospatial parsers + + def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None): '''Parse a feed from a URL, file, stream, or string. request_headers, if given, is a dict from http header name to value to add to the request; this overrides internally generated values. + + :return: A :class:`FeedParserDict`. ''' if handlers is None: @@ -4011,3 +3967,41 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['version'] = result['version'] or feedparser.version result['namespaces'] = feedparser.namespacesInUse return result + +# The list of EPSG codes for geographic (latitude/longitude) coordinate +# systems to support decoding of GeoRSS GML profiles. +_geogCS = [ +3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008, +4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022, +4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, +4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081, +4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, +4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, +4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, +4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, +4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, +4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, +4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, +4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, +4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, +4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253, +4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266, +4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279, +4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293, +4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307, +4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322, +4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603, +4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616, +4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629, +4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642, +4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665, +4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, +4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691, +4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704, +4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717, +4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730, +4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743, +4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756, +4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804, +4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818, +4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]